# Base Model

 - Logistic Regression + CountVectorizer
 - Logistic Regression + TFIDF


In [63]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

In [64]:
data = pd.read_csv("/content/malayalam-names.csv")
print(data.shape)
data.head()

(22, 2)


Unnamed: 0,Name,Label
0,അദിതി,0
1,ആക്ഷയാ,0
2,ബാബു,1
3,മധു,1
4,ആദിൽ,1


In [65]:
data["Label"].value_counts()

0    11
1    11
Name: Label, dtype: int64

In [66]:
# Preparing the data 


X = data["Name"]
Y = data["Label"]

In [67]:
# splitting the dataset training and testing

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=42) 

## Using Word Tokenizer & logistic regression

In [68]:
# Vectorizing the text data

cv = CountVectorizer()

text_train = cv.fit_transform(X_train)
text_test = cv.transform(X_test)


# Training the model using logistic regression
lr_cv = LogisticRegression()
lr_cv.fit(text_train, y_train)

LogisticRegression()

In [69]:
y_pred = lr_cv.predict(text_test)
ac = accuracy_score(y_test, y_pred)*100

print("Accuracy is", ac)

Accuracy is 60.0


In [70]:
# sample prediction

def gender_predictor(name):
  
    test_name = [name]
    vector = cv.transform(test_name).toarray()
    if lr_cv.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [71]:
print(gender_predictor("ആശ"), "\n")
print(gender_predictor("വൈശാഖ്"))

Female
None 

Female
None


## TF-IDF Vectorizer & logistic regression

In [72]:
# Vectorizing the text data

tf = TfidfVectorizer()

text_train = tf.fit_transform(X_train)
text_test = tf.transform(X_test)


# Training the model using logistic regression
lr_tf = LogisticRegression()
lr_tf.fit(text_train, y_train)

LogisticRegression()

In [73]:
y_pred = lr_tf.predict(text_test)
ac = accuracy_score(y_test, y_pred)*100

print("Accuracy is", ac)

Accuracy is 40.0


In [74]:
# sample prediction

def gender_predictor(name):
  
    test_name = [name]
    vector = tf.transform(test_name).toarray()
    if lr_tf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [75]:
print(gender_predictor("ആശ"), "\n")
print(gender_predictor("വൈശാഖ്"))

Male
None 

Male
None


We haven't used any complex fine tuning, label_encoding techniques and even a proper dataset at the moment.As the project moving on, we may able to improve the perfomance.