In [4]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

In [6]:

male_df = pd.read_csv("Indian-Female-Names.csv")
female_df = pd.read_csv("Indian-Male-Names.csv")

male_df["gender"] = "male"
female_df["gender"] = "female"

df = pd.concat([male_df, female_df], ignore_index=True)

# Make sure names are in lowercase
df["name"] = df["name"].str.lower().str.strip()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30227 entries, 0 to 30226
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    30172 non-null  object
 1   gender  30227 non-null  object
 2   race    30227 non-null  object
dtypes: object(3)
memory usage: 708.6+ KB


In [12]:


# Drop missing or null names
df = df.dropna(subset=["name"])

# Make sure all names are strings
df["name"] = df["name"].astype(str).str.lower().str.strip()
print(df.isna().count())
print(df["gender"].unique())

name      30172
gender    30172
race      30172
dtype: int64
['male' 'female']


In [13]:
def extract_features(name):
    return {
        "last_char": name[-1:],
        "last_2": name[-2:],
        "last_3": name[-3:]
    }

features = df["name"].apply(extract_features)
features_df = pd.DataFrame(features.tolist())

# Vectorize the features

vec = DictVectorizer(sparse=False)
X = vec.fit_transform(features_df.to_dict(orient="records"))

y = df["gender"]


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))


Validation Accuracy: 0.8631317315658658


In [None]:
# Load test data
test_df = pd.read_csv("test.csv")
test_df["name"] = test_df["name"].str.lower().str.strip()

# Extract features
test_features = test_df["name"].apply(extract_features)
test_features_df = pd.DataFrame(test_features.tolist())

# Vectorize using the same vectorizer
X_test = vec.transform(test_features_df.to_dict(orient="records"))

# Predict
test_df["predicted_gender"] = model.predict(X_test)

# Evaluate
from sklearn.metrics import classification_report

print(classification_report(test_df["gender"], test_df["predicted_gender"]))
