Adatbázis eléretőségek: 

- https://www.kaggle.com/miroslavsabo/young-people-survey
- https://www.kaggle.com/datasnaek/mbti-type
- https://www.kaggle.com/vitaliymalcev/personal-distance-in-42-countries
- https://www.kaggle.com/lucasgreenwell/depression-anxiety-stress-scales-responses

Stat háttér

- https://www.statokos.com/nemparamteresprobak 
- https://machinelearningmastery.com/a-gentle-introduction-to-normality-tests-in-python/</br>

Két fajta megközelítés:

1. Hipotézisvizsgálat - Nonparametrikus próbával vizsgáljuk, hogy a változók mentén megmutatkozik-e különbség a nemi csoportok között

2. Gépi tanulás - Gépi tanuló algoritmusokkal vizsgáljuk meg, hogy a nem, mint klasszifikációs cimke, meghatározható-e a magyarázó változók segítségével

# Modulok importálása

In [None]:
!python -V

In [1]:
# Modulok

import pandas as pd
import numpy as np

#Vizualizáció

import matplotlib.pyplot as plt

# Normalitás vizsgálat

from scipy.stats import shapiro
from scipy.stats import zscore, boxcox

# MANOVA

from statsmodels.multivariate.manova import MANOVA

# Mann-Whitney U

from scipy.stats import mannwhitneyu

# ML

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm

from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_curve, roc_auc_score, matthews_corrcoef, f1_score, precision_score

# WARNING Off

import warnings
warnings.simplefilter("ignore")

# Adatok beolvasása

In [2]:
data = pd.read_csv('./data/responses.csv')

In [111]:
len(data)

1010

# Filmpreferencia nemi különbségei

## Adatok kiválasztása és előkészítése

In [112]:
data_mov = data.iloc[:, 20:31]
data_mov['Gender'] = data['Gender']

In [113]:
for i in range(0, len(data_mov)):
    if data_mov['Gender'][i] == 'male':
        data_mov['Gender'][i] = 1       
    else:
        data_mov['Gender'][i] = 2

In [27]:
data_mov = data_mov.dropna()

In [28]:
for i in range(0, len(data_mov.columns)):
    data_mov[data_mov.columns[i]] = data_mov[data_mov.columns[i]].astype('int')

In [109]:
data_mov['Gender'].value_counts(normalize = True)

2    0.595918
1    0.404082
Name: Gender, dtype: float64

## Normalitás vizsgálat

In [74]:
# Adatbázis előkészítése

norm_test = pd.DataFrame()

norm_test['Var'] = data_mov.columns[0:11]
norm_test['Stat'] = range(0, 11)
norm_test['Stat'] = norm_test['Stat'].astype('float')
norm_test['p_value'] = range(0, 11)
norm_test['p_value'] = norm_test['p_value'].astype('float')

In [76]:
# Shapiro-Wilk teszt

for i in range(0, len(data_mov.columns)):
    stat, p = shapiro(data_mov.iloc[:, i])
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    norm_test['Stat'][i] = stat
    norm_test['p_value'][i] = p

Statistics=0.883, p=0.000
Statistics=0.902, p=0.000
Statistics=0.672, p=0.000
Statistics=0.893, p=0.000
Statistics=0.904, p=0.000
Statistics=0.896, p=0.000
Statistics=0.860, p=0.000
Statistics=0.845, p=0.000
Statistics=0.885, p=0.000
Statistics=0.840, p=0.000
Statistics=0.886, p=0.000
Statistics=0.623, p=0.000


In [78]:
norm_test.to_excel('mov_norm_test.xlsx')

## Mann-Whitney

In [80]:
data_mov_male = data_mov[data_mov['Gender'] == 1]
data_mov_female = data_mov[data_mov['Gender'] == 2]

In [96]:
# Adatbázis előkészítése

MW_test = pd.DataFrame()

MW_test['Var'] = data_mov.columns[0:11]
MW_test['Stat'] = range(0, 11)
MW_test['Stat'] = MW_test['Stat'].astype('float')
MW_test['p_value'] = range(0, 11)
MW_test['p_value'] = MW_test['p_value'].astype('float')
MW_test['Hyp'] = range(0, 11)

In [97]:
# Mann-Whitney teszt

for i in range(0, len(data_mov.columns)):
    stat, p = mannwhitneyu(data_mov_male.iloc[:, i], data_mov_female.iloc[:, i])

    print('Statistics=%.3f, p=%.3f' % (stat, p))
    MW_test['Stat'][i] = stat
    MW_test['p_value'][i] = p

    # interpret
    alpha = 0.05
    if p > alpha:
        MW_test['Hyp'][i] = "nullhipotézis elfogadva"
    else:
        MW_test['Hyp'][i] = "alternatív hipotézis elfogadva"

Statistics=93936.500, p=0.000
Statistics=90484.000, p=0.000
Statistics=113590.500, p=0.290
Statistics=57688.000, p=0.000
Statistics=73344.500, p=0.000
Statistics=59857.500, p=0.000
Statistics=77706.000, p=0.000
Statistics=89806.500, p=0.000
Statistics=99225.000, p=0.000
Statistics=68718.500, p=0.000
Statistics=60498.500, p=0.000
Statistics=0.000, p=0.000


In [99]:
MW_test.to_excel('MW_test_eredmeny.xlsx')

In [104]:
data_mov.iloc[:, 0:10].groupby(data_mov['Gender']).mean()

Unnamed: 0_level_0,Movies,Horror,Thriller,Comedy,Romantic,Sci-fi,War,Fantasy/Fairy tales,Animated,Documentary
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,4.587786,3.061069,3.661578,4.483461,2.847328,3.615776,3.824427,3.340967,3.503817,3.80916
2,4.621343,2.600688,3.187608,4.509466,3.929432,2.771084,2.681583,4.030981,3.969019,3.528399


## Gépi tanulás alkalmazása

In [37]:
model_vars = data_mov.drop(columns = 'Gender')
target = data_mov.iloc[:, -1]

In [38]:
x_train, x_test, y_train, y_test = train_test_split(model_vars, target, test_size=0.3, random_state=1)

In [118]:
len(x_train), len(x_test)

(686, 294)

In [39]:
# Log. reg. algoritmus

predicted_var = y_train


model = LogisticRegression(max_iter = 1000)
model.fit(x_train, predicted_var)

predictions = model.predict(x_test)

results = confusion_matrix(y_test, predictions)

print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(y_test, predictions))
print('Matthews corcoef :', matthews_corrcoef(y_test, predictions))
print('Recall Score :', recall_score(y_test, predictions))
print('F1 Score:', f1_score(y_test, predictions, average='macro'))
print('Precision:', precision_score(y_test, predictions, average='macro'))
print('ROC_AUC:', roc_auc_score(y_test, predictions))

Confusion Matrix :
[[ 93  26]
 [ 34 141]]
Accuracy Score : 0.7959183673469388
Matthews corcoef : 0.5818865836085587
Recall Score : 0.7815126050420168
F1 Score: 0.7903294822421909
Precision: 0.788297420906219
ROC_AUC: 0.7936134453781513


In [119]:
y_test.value_counts()

2    175
1    119
Name: Gender, dtype: int64

In [40]:
sorted(zip(model.coef_[0], x_train.columns), key=lambda x: abs(x[0]), reverse=True)[:]

[(0.9495032601232561, 'Romantic'),
 (-0.5770147173515029, 'Action'),
 (-0.5083065567865432, 'War'),
 (-0.4704384508126307, 'Western'),
 (0.4260766063129754, 'Fantasy/Fairy tales'),
 (-0.39881315895683506, 'Comedy'),
 (-0.3577358216085411, 'Sci-fi'),
 (0.15789319889825384, 'Animated'),
 (0.10104705355234803, 'Thriller'),
 (0.06209479928814304, 'Documentary'),
 (-0.05854447966710079, 'Horror')]

In [41]:
# Random forest

predicted_var = y_train


model = RandomForestClassifier(n_estimators = 200, max_depth = 9, random_state = 1, class_weight = 'balanced')
# model = ExtraTreesClassifier(n_estimators = 10)

model.fit(x_train, predicted_var)

predictions = model.predict(x_test)

results = confusion_matrix(y_test, predictions)

print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(y_test, predictions))
print('Matthews corcoef :', matthews_corrcoef(y_test, predictions))
print('Recall Score :', recall_score(y_test, predictions))
print('F1 Score:', f1_score(y_test, predictions, average='macro'))
print('Precision:', precision_score(y_test, predictions, average='weighted'))
print('ROC_AUC:', roc_auc_score(y_test, predictions))

Confusion Matrix :
[[ 88  31]
 [ 32 143]]
Accuracy Score : 0.7857142857142857
Matthews corcoef : 0.555905051212425
Recall Score : 0.7394957983193278
F1 Score: 0.7779429571639231
Precision: 0.786015325670498
ROC_AUC: 0.7783193277310925


In [42]:
importances = model.feature_importances_
df_importance =pd.DataFrame()
df_importance['imp'] = importances
df_importance['colnames'] = x_train.columns

In [43]:
df_importance.sort_values(by=['imp'], ascending = False).head(10)

Unnamed: 0,imp,colnames
3,0.179813,Romantic
10,0.134886,Action
5,0.134049,War
9,0.105694,Western
4,0.091491,Sci-fi
6,0.08175,Fantasy/Fairy tales
0,0.062538,Horror
1,0.059711,Thriller
7,0.059454,Animated
8,0.051249,Documentary


In [44]:
# XG Boost

predicted_var = y_train


model = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.1, 
                                   max_features = None, max_depth = 4, random_state = 0)

model.fit(x_train, predicted_var)


pred_raw = model.predict(x_test)
predictions = model.predict(x_test).round()

results = confusion_matrix(y_test, predictions)

print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(y_test, pred_raw))
print('Matthews corcoef :', matthews_corrcoef(y_test, predictions))
print('Recall Score :', recall_score(y_test, predictions))
print('F1 Score:', f1_score(y_test, predictions, average='macro'))
print('Precision:', precision_score(y_test, predictions, average='weighted'))

Confusion Matrix :
[[ 90  29]
 [ 33 142]]
Accuracy Score : 0.7891156462585034
Matthews corcoef : 0.5649169083082547
Recall Score : 0.7563025210084033
F1 Score: 0.7823054507237376
Precision: 0.7904585311517275


In [103]:
# SVM

predicted_var = y_train

model = svm.SVC(kernel='linear')

model.fit(x_train, predicted_var)


pred_raw = model.predict(x_test)
predictions = model.predict(x_test).round()

results = confusion_matrix(y_test, predictions)

print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(y_test, pred_raw))
print('Matthews corcoef :', matthews_corrcoef(y_test, predictions))
print('Recall Score :', recall_score(y_test, predictions))
print('F1 Score:', f1_score(y_test, predictions, average='macro'))
print('Precision:', precision_score(y_test, predictions, average='weighted'))

Confusion Matrix :
[[ 92  27]
 [ 32 143]]
Accuracy Score : 0.7993197278911565
Matthews corcoef : 0.5866711652575564
Recall Score : 0.773109243697479
F1 Score: 0.7930935766684558
Precision: 0.8010074997741032
