In [1]:
#Import statements
import pandas as pd
from pathlib import Path

import texthero as hero
from texthero import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
#Load CSV in to main_df variable, set sep=None to make it autorecognize the delimiter, specify engine as python
#CSV was found here: 'https://www.kaggle.com/datasets/kaggle/us-baby-names?resource=download'
main_df = pd.read_csv(Path('resources/NationalNames.csv'), sep=None, engine='python')

In [3]:
#Remove any entries below 500 names to trim the df to something my system can handle
main_df = main_df.drop(main_df[main_df['Count'] < 500].index)

main_df = main_df.reset_index()

#Convert the values of the Gender column from "M" or "F" to "0" or "1"
main_df['Gender'] = main_df['Gender'].str.replace(r'M', '0', regex=True)
main_df['Gender'] = main_df['Gender'].str.replace(r'F', '1', regex=True).astype('int')

#Convert Gender column to int and conver Name column to string
main_df['Gender'] = main_df['Gender'].astype('int')
main_df['Name'] = main_df['Name'].astype('string')

#Drop the Id column (it is not needed)
main_df = main_df.drop(columns=['Id', 'index'])

In [4]:
#Clean the data in the 'Name' column in preparation for converting from text data string to a vectorized token to be used in the model
custom_pipeline = [#preprocessing.fillna,
                   preprocessing.lowercase,
                   preprocessing.remove_whitespace,
                   preprocessing.remove_diacritics,
                   preprocessing.remove_brackets
                  ]
main_df['Name'] = hero.clean(main_df['Name'], custom_pipeline)
main_df['Name'] = [n.replace('{','') for n in main_df['Name']]
main_df['Name'] = [n.replace('}','') for n in main_df['Name']]
main_df['Name'] = [n.replace('(','') for n in main_df['Name']]
main_df['Name'] = [n.replace(')','') for n in main_df['Name']]

In [5]:
#Create a column 'Rank' which ranks each name based on Count, per year. 
# i.e. there is a new set of ranks for each year, starting at rank 1 for the name with the highest count that year.
main_df['Rank'] = (main_df.sort_values(['Year','Count'], ascending=[True,True])
                .groupby(['Year']).cumcount() + 1
             )

In [6]:
#Create variables for the features(X) and the target(y)
X = main_df['Name']
y = main_df['Gender']


In [7]:
#Split the data in to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

In [8]:

clf = LogisticRegression()

In [9]:
vectorizer = CountVectorizer(ngram_range=(2, 11), analyzer='char', preprocessor=lambda x: f'^{x.lower().replace(" ", "")}$',)
vectorizer = vectorizer.fit(X_train)

In [10]:
pipeline = Pipeline([('vectorizer', vectorizer), ('LogReg', clf)])

pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(steps=[('vectorizer',
                 CountVectorizer(analyzer='char', ngram_range=(2, 11),
                                 preprocessor=<function <lambda> at 0x0000018BD2863678>)),
                ('LogReg', LogisticRegression())])

In [11]:
predictions = pipeline.predict(X_test)

In [13]:
# Evaluate the model's performance
accuracy = accuracy_score(predictions, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 97.66%


In [14]:
#Create a dataframe showing predictions vs test
comparison = pd.DataFrame({"Testing Data Predictions": predictions,
    "Testing Data Actual Targets": y_test})
comparison = comparison.sort_index(ascending=True)
comparison.sample(25)

Unnamed: 0,Testing Data Predictions,Testing Data Actual Targets
7322,1,1
52951,1,1
48657,0,0
51510,1,1
77488,0,0
3213,0,0
29081,1,1
40548,1,1
12150,1,1
69175,1,1
