In [None]:
# Imports needed by this cell
import pandas as pd
from google.colab import drive

# Attach Google Drive to the Colab runtime so the CSV below is reachable
drive.mount('/content/drive')

# Location of the name/gender CSV inside the mounted Drive
file_path = '/content/drive/My Drive/name_gender_dataset.csv'

# Read the dataset into a DataFrame and preview its first ten rows
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=10))



Mounted at /content/drive
      Name Gender    Count  Probability
0    James      M  5304407     0.014517
1     John      M  5260831     0.014398
2   Robert      M  4970386     0.013603
3  Michael      M  4579950     0.012534
4  William      M  4226608     0.011567
5     Mary      F  4169663     0.011411
6    David      M  3787547     0.010366
7   Joseph      M  2695970     0.007378
8  Richard      M  2638187     0.007220
9  Charles      M  2433540     0.006660


In [None]:
# Preview the first five rows
print(df.head())

# List the column labels
print(df.columns)

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
df.info()


      Name Gender    Count  Probability
0    James      M  5304407     0.014517
1     John      M  5260831     0.014398
2   Robert      M  4970386     0.013603
3  Michael      M  4579950     0.012534
4  William      M  4226608     0.011567
Index(['Name', 'Gender', 'Count', 'Probability'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147269 entries, 0 to 147268
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Name         147269 non-null  object 
 1   Gender       147269 non-null  object 
 2   Count        147269 non-null  int64  
 3   Probability  147269 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 4.5+ MB
None


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Function to extract properties from names
def gender_features(name):
    """Build the feature dictionary describing a single name.

    Args:
        name: The name as a string.

    Returns:
        A dict with two entries:
        'last_letter' -- the final character of ``name`` lower-cased, or ''
        when ``name`` is empty;
        'name_length' -- the number of characters in ``name``.
    """
    return {
        # Slice rather than index: name[-1] raises IndexError on an empty
        # string, whereas name[-1:] simply yields ''.
        'last_letter': name[-1:].lower(),       # Last letter
        'name_length': len(name),               # Name length
    }

# Pair each name's feature dict with its gender label, one pair per row
featuresets = [
    (gender_features(person_name), label)
    for person_name, label in zip(df['Name'], df['Gender'])
]

# Hold out 30% of the pairs for testing; the fixed seed keeps the split repeatable
train_set, test_set = train_test_split(featuresets, test_size=0.3, random_state=42)

# Separate each split into its feature dicts and its labels
train_features, y_train = map(list, zip(*train_set))
test_features, y_test = map(list, zip(*test_set))

# Convert the feature dicts into dense arrays (sparse=False); the vectorizer
# is fitted on the training split only and then reused for the test split
vectorizer = DictVectorizer(sparse=False)
X_train = vectorizer.fit_transform(train_features)
X_test = vectorizer.transform(test_features)


In [None]:
# Fit a Gaussian Naive Bayes model on the training split
nb_classifier = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred_nb = nb_classifier.predict(X_test)

# Share of test labels predicted correctly, reported as a percentage
accuracy_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print(f"Naive Bayes Doğruluğu: {accuracy_nb * 100:.2f}%")


Naive Bayes Doğruluğu: 70.52%


In [None]:
# Fit a decision tree on the training split (seeded via random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred_dt = dt_classifier.predict(X_test)

# Share of test labels predicted correctly, reported as a percentage
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print(f"Karar Ağaçları Doğruluğu: {accuracy_dt * 100:.2f}%")


Karar Ağaçları Doğruluğu: 74.92%


In [None]:
# Show both accuracies together, one model per line
print(
    f"\nNaive Bayes Doğruluğu: {accuracy_nb * 100:.2f}%",
    f"Karar Ağaçları Doğruluğu: {accuracy_dt * 100:.2f}%",
    sep="\n",
)



Naive Bayes Doğruluğu: 70.52%
Karar Ağaçları Doğruluğu: 74.92%
