In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the gender-classification CSV into `df`, the frame every later cell reads from.
DATA_PATH = '/kaggle/input/gender-classification-dataset/gender_classification_v7.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

# NO NULL VALUES

In [None]:
# Summarise missing values per column: absolute count and share of rows (in %).
null_mask = df.isnull()
null_counts = null_mask.sum()

total_null = null_counts.sort_values(ascending=False)
percent = ((null_counts / null_mask.count()) * 100).sort_values(ascending=False)
print("Total records = ", len(df))

# Side-by-side table of both measures, one row per column of `df`.
missing_data = pd.concat(
    [total_null, percent.round(2)],
    axis=1,
    keys=['Total Missing', 'In Percent'],
)
missing_data.head(16)


In [None]:
import seaborn as sns
# Correlate the numeric columns only. `gender` holds string labels, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently dropped them), so select the numeric columns explicitly.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr)
plt.title("Correlation between numeric features")
plt.show()

In [None]:
# Bar chart of how many rows fall into each gender class.
gender_count = df["gender"].value_counts()

fig, ax = plt.subplots()
ax.bar(gender_count.index, gender_count.values)
ax.set_title("Gender Ratio")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Restrict to rows flagged as having long hair, then count them per gender.
long_hair = df[df["long_hair"] == 1]
gender_count = long_hair["gender"].value_counts()

# One bar per gender for the long-hair subset.
ax = sns.barplot(x=gender_count.index, y=gender_count.values)
ax.set_title("Gender of people with long hair")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Row counts for each `nose_wide` value, split by gender.
sns.countplot(data=df, x='nose_wide', hue='gender')

In [None]:
# Row counts for each `nose_long` value, split by gender.
sns.countplot(data=df, x='nose_long', hue='gender')

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Work on a copy so that changes made to features1 do not touch `df`.
features1=df.copy()

In [None]:
# Encode the target as 1 for 'Male' and 0 otherwise. The labels are read from
# the untouched `df` (features1 is a copy of it, so the index lines up), which
# keeps this cell idempotent: re-running it no longer compares the
# already-encoded integers with 'Male' and zeroes out the whole column.
features1['gender'] = (df['gender'] == 'Male').astype(int)

In [None]:
# Hold out 20% of the rows. The fixed seed makes the split (and therefore the
# reported accuracy) reproducible and matches the seed used by the KNN cells.
X_train, X_test, Y_train, Y_test = train_test_split(
    features1.drop('gender', axis=1), features1['gender'],
    test_size=0.2, random_state=42)

# Fit ordinary least squares on the integer-encoded gender target.
lr = LinearRegression()
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

# Turn the continuous predictions into class labels by thresholding at 0.5
# (vectorised; kept as a plain list like the original loop produced).
Y_pred_binary = (Y_pred >= 0.5).astype(int).tolist()

accuracy1 = accuracy_score(Y_test,Y_pred_binary)
accuracy1*100

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Separate copy of `df` used by the logistic-regression cells.
features2=df.copy()

In [None]:
# Hold out 20% of the rows. The fixed seed makes the split reproducible and
# matches the seed used by the KNN cells, so model scores are comparable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features2.drop('gender', axis=1), features2['gender'],
    test_size=0.2, random_state=42)

In [None]:
# Fit a logistic-regression classifier with default settings on the training split.
lor = LogisticRegression()
lor.fit(X=X_train, y=Y_train)

In [None]:
# Predict labels for the held-out rows.
Y_pred = lor.predict(X_test)

In [None]:
# Fraction of held-out rows predicted correctly; displayed as a percentage.
accuracy2 = accuracy_score(Y_test , Y_pred)
accuracy2*100

# Decision Tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Separate copy of `df` used by the decision-tree cells.
features3=df.copy()

In [None]:
# Hold out 20% of the rows. The fixed seed makes the split reproducible and
# matches the seed used by the KNN cells, so model scores are comparable.
X_train, X_test, Y_train, Y_test = train_test_split(
    features3.drop('gender', axis=1), features3['gender'],
    test_size=0.2, random_state=42)

In [None]:
# Fit a decision tree on the training split. Tree construction involves random
# tie-breaking, so a fixed seed is needed for the fitted tree (and its
# accuracy) to be the same on every run.
dectree = DecisionTreeClassifier(random_state=42)
dectree.fit(X_train,Y_train)

In [None]:
# Score the decision tree on the held-out rows; displayed as a percentage.
Y_pred = dectree.predict(X_test)
accuracy3 = accuracy_score(Y_test,Y_pred)
accuracy3*100

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Separate copy of `df` used by the random-forest cells.
features4 = df.copy()

In [None]:
# Hold out 20% of the rows. The fixed seed makes the split reproducible and
# matches the seed used by the KNN cells. These lower-case names are reused by
# the random-forest grid-search cells further down.
x_train, x_test, y_train, y_test = train_test_split(
    features4.drop('gender', axis=1), features4['gender'],
    test_size=0.2, random_state=42)

# Seed the forest as well: bootstrap sampling and feature sub-sampling are
# random, so without a seed the accuracy changes from run to run.
ranfo = RandomForestClassifier(random_state=42)
ranfo.fit(x_train, y_train)

y_pred = ranfo.predict(x_test)

# Fraction of held-out rows predicted correctly; displayed as a percentage.
accuracy4 = accuracy_score(y_test, y_pred)
accuracy4*100

# Random Forest using GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate random-forest hyperparameters: 3 x 3 x 3 = 27 combinations.
param_grid1 = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Exhaustive 5-fold search over param_grid1 (27 combinations -> 135 fits plus a
# final refit). n_jobs=-1 spreads the fits across all cores, consistent with
# the KNN grid search later in the notebook; it does not change which
# combinations are evaluated.
grid_search = GridSearchCV(estimator=ranfo, param_grid=param_grid1, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

In [None]:
# Report the best parameter combination, then score the search object on the
# held-out rows (GridSearchCV.predict uses the refit best estimator when
# refit is left at its default).
print('Best Hyperparameters:', grid_search.best_params_)
y_pred = grid_search.predict(x_test)
accuracy_5 = accuracy_score(y_test, y_pred)
accuracy_5*100

# KNN

In [None]:
# Separate copy of `df` used by the KNN cells.
features6=df.copy()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Feature matrix (every column except the target) and the target column.
X = features6.drop(columns='gender')
Y = features6['gender']

In [None]:
# Hold out 20% of the rows; random_state=42 fixes the split across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Fit a 3-nearest-neighbours classifier on the training split.
# `knn` is also the base estimator passed to the KNN grid search below.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train, y=Y_train)

In [None]:
# Predict labels for the held-out rows.
Y_pred = knn.predict(X_test)

In [None]:
# Fraction of held-out rows predicted correctly; displayed as a percentage.
accuracy6 = accuracy_score(Y_test,Y_pred)
accuracy6*100

# KNN using GridSearchCV

In [None]:
# Separate copy of `df` used by the KNN grid-search cells.
features7=df.copy()

In [None]:
# Feature matrix (every column except the target) and the target column.
X = features7.drop(columns='gender')
Y = features7['gender']

In [None]:
# Hold out 20% of the rows; random_state=42 fixes the split across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Candidate KNN hyperparameters: 20 x 2 x 4 x 2 = 320 combinations.
param_dist = dict(
    n_neighbors=range(1, 21),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

In [None]:
# Exhaustive 5-fold search over param_dist, run on all available cores.
# Note: this rebinds `grid_search`, replacing the random-forest search object.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_dist,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

In [None]:
# Show the parameter combination selected by the KNN grid search.
print(grid_search.best_params_)

In [None]:
# Predict labels for the held-out rows with the fitted search object.
Y_pred = grid_search.predict(X_test)

In [None]:
# accuracy7: accuracy on the held-out test rows.
# accuracy7t: grid_search.best_score_, i.e. the cross-validation score of the
# selected parameters on the training split (cv=5) -- not a test-set accuracy.
accuracy7 = accuracy_score(Y_test,Y_pred)
accuracy7t = grid_search.best_score_ 
accuracy7*100


In [None]:
# Display the cross-validation score of the best KNN parameters (a fraction, not a percentage).
accuracy7t

# Graphs

In [None]:
# Bar chart comparing the held-out test accuracy of every model.
Algomodels = ['Linear Regression','Logistic Regression','Decision Tree','Random Forest','Random Forest after CV','KNN','KNN after CV']
# Every entry must be a test-set accuracy so the bars are comparable. The tuned
# KNN therefore uses accuracy7 (test accuracy) rather than accuracy7t, which is
# the cross-validation score measured on the training split.
Accuracies = [accuracy1,accuracy2,accuracy3,accuracy4,accuracy_5,accuracy6,accuracy7]
plt.figure(figsize=(10,8))
plt.bar(Algomodels,Accuracies)
plt.title('Model Accuracies')
plt.ylabel('Accuracy')
plt.ylim(0,1)  # accuracies are fractions in [0, 1]
plt.xticks(rotation=90)  # long model names would overlap if drawn horizontally
plt.show()