In [1]:
%config IPCompleter.greedy=True


import pandas
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import train_test_split


%matplotlib inline

In [2]:
file = './test_data.json'

with open(file) as f:
    json_data = json.load(f)
    
df = pandas.DataFrame.from_dict(
    data=json_data,
    orient='columns'
)

In [3]:
# gender_map = {'m': 0, 'f': 1, 'o': 2}
# df['gender'] = df['gender'].map(gender_map)



In [4]:
df.head()

Unnamed: 0,Q1,Q2,Q3,gender
0,red,history,volleyball,f
1,red,psychology,tennis,f
2,blue,engineering,basketball,m
3,blue,engineering,tennis,m
4,red,history,tennis,f


In [5]:
feature_col_names = ['Q1', 'Q2', 'Q3']
class_col_name = ['gender']

x = pandas.get_dummies(df, columns=feature_col_names).drop(columns=class_col_name).values
y = df[class_col_name].values


In [6]:
x[:5]

array([[0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 0, 1, 0, 1, 0],
       [1, 0, 1, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 1, 0]], dtype=uint8)

In [33]:
from sklearn.naive_bayes import GaussianNB

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=4)

nb_model = GaussianNB()

nb_model.fit(X_train, y_train.ravel())


GaussianNB(priors=None, var_smoothing=1e-09)

In [34]:
from sklearn import metrics

nb_predict_train = nb_model.predict(X_train)

print("Accuracy for test: {0:.4f}".format(metrics.accuracy_score(y_train, nb_predict_train)))

Accuracy for test: 0.6957


In [36]:
print(metrics.classification_report(y_train, nb_predict_train))

              precision    recall  f1-score   support

           f       1.00      0.46      0.63        13
           m       0.59      1.00      0.74        10

   micro avg       0.70      0.70      0.70        23
   macro avg       0.79      0.73      0.69        23
weighted avg       0.82      0.70      0.68        23



In [9]:
new_sub = [{
    'Q1': 'blue',
    'Q2': 'engineering',
    'Q3': 'basketball'
}]

In [10]:
df2 = pandas.DataFrame.from_dict(
    data=new_sub,
    orient='columns'
)

df2

Unnamed: 0,Q1,Q2,Q3
0,blue,engineering,basketball


In [11]:
df2_dummies = pandas.get_dummies(df2, columns=feature_col_names).values

df2_dummies

array([[1, 1, 1]], dtype=uint8)

In [12]:
arr = [[0, 1, 1, 0, 0, 1, 0, 0]]
nb_model.predict(arr)

array(['m'], dtype='<U1')

In [94]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

rf_model = RandomForestClassifier(class_weight='balanced')

rf_model.fit(X_train, y_train.ravel())




RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [95]:
rf_predict_train = rf_model.predict(X_train)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, rf_predict_train)))

Accuracy: 0.9231


In [96]:
rf_predict_train = rf_model.predict(X_test)
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, rf_predict_train)))

Accuracy: 0.7692


In [31]:
print(metrics.classification_report(y_test, rf_predict_train))

              precision    recall  f1-score   support

           f       0.67      0.67      0.67         6
           m       0.71      0.71      0.71         7

   micro avg       0.69      0.69      0.69        13
   macro avg       0.69      0.69      0.69        13
weighted avg       0.69      0.69      0.69        13



In [92]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

lr_model = LogisticRegression(C=0.6, class_weight="balanced", random_state=42)
lr_model.fit(X_train, y_train.ravel())
lr_predict_test = lr_model.predict(X_test)



In [93]:
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_predict_test)))

Accuracy: 0.9231
