In [2]:
# general use
import os
import scipy as sp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from sklearn.model_selection import train_test_split

# for evaluation
from statistics import mean
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score

# for current method

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis


In [3]:
data_dirpath = 'dataset'
train_name = 'train.csv'
test_name = 'test.csv'

# Train data set path and test data path
train_path = os.path.join(data_dirpath, train_name)
test_path = os.path.join(data_dirpath, test_name)
train_df = pd.read_csv(train_path, header=[0])
test_df = pd.read_csv(test_path, header=[0])

print(f'[Default] Number of train data: {train_df.shape[0]}, Number of test data: {test_df.shape[0]}')

# Because Lead data type is string, we have to change it into integer format
lead_map = {'Female': 0, 'Male': 1}
train_df['Lead'] = train_df['Lead'].map(lead_map).astype(int)

# Split into input and output.
x_data=train_df.loc[:, train_df.columns != 'Lead']
y_data=train_df.loc[:, train_df.columns == 'Lead']

feature_names = x_data.columns.tolist()
print(feature_names)

[Default] Number of train data: 1039, Number of test data: 387
['Number words female', 'Total words', 'Number of words lead', 'Difference in words lead and co-lead', 'Number of male actors', 'Year', 'Number of female actors', 'Number words male', 'Gross', 'Mean Age Male', 'Mean Age Female', 'Age Lead', 'Age Co-Lead']


In [4]:
# For cross vaidation
# X_train, X_test, y_train, y_test 
X_train1, X_test1, y_train1, y_test1 = train_test_split(x_data, y_data, test_size=0.2, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(x_data, y_data, test_size=0.2, random_state=1)
X_train3, X_test3, y_train3, y_test3 = train_test_split(x_data, y_data, test_size=0.2, random_state=2)
X_train4, X_test4, y_train4, y_test4 = train_test_split(x_data, y_data, test_size=0.2, random_state=3)
X_train5, X_test5, y_train5, y_test5 = train_test_split(x_data, y_data, test_size=0.2, random_state=4)

# Train-set
X_trainset = (X_train1, X_train2, X_train3, X_train4, X_train5)
y_trainset = (y_train1, y_train2, y_train3, y_train4, y_train5)

# Test-set
X_testset = (X_test1, X_test2, X_test3, X_test4, X_test5)
y_testset = (y_test1, y_test2, y_test3, y_test4, y_test5)


In [56]:
"""
Evaluate its performance using cross validation.
Exactly how to carry out this evaluation is up to you to decide.
---
Use misclassification error,
false positives, false negatives and ROC / AUC
---
Question
1. Do men or women dominate speaking roles in Hollywood movies?
 남자와 여자 중 누가 헐리우드 영화의 speaking roll을 지배하고 있나

2. Has gender balance in speaking roles changed over time (years)
 시간이 지날수록 speaking role의 균형은 맞춰지고 있나?

3. Do films in which men do more speaking make a lot more money than films in which women speak more?
 남자가 speaking role을 많이 가져갔을 때 돈을 많이 벌었을까 아님 반대일까?

Lead = 주연
co-lead = 남자일경우 여자 and vice versa

"""
# LDA (Linear Discriminent Analysis)
model = LinearDiscriminantAnalysis()

# Number words female, Number words male / Year / Gross.
# Feature 1 include all the importance features in Input

# Features
"""
'Number words female',
'Total words',
'Number of words lead',
'Difference in words lead and co-lead',
'Number of male actors',
'Year',
'Number of female actors',
'Number words male',
'Gross',
'Mean Age Male',
'Mean Age Female',
'Age Lead',
'Age Co-Lead'

"""

FeatureList = [
'Number words female',
'Total words',
'Number of words lead',
'Difference in words lead and co-lead',
'Number of male actors',
'Year',
'Number of female actors',
'Number words male',
'Gross',
'Mean Age Male',
'Mean Age Female',
'Age Lead',
'Age Co-Lead'
]
ResultFeature = 'Lead'

accuracy_list = []
recall_list = []
precision_list = []

# print(X_trainset[1][np.array(FeatureList)])
for X_train, X_test, y_train, y_test in zip(X_trainset, X_testset, y_trainset, y_testset):
    # fit model
    model.fit(X_train[np.array(FeatureList)], y_train[ResultFeature]) 
    
    # predict the test with model
    predict_prob = model.predict_proba(X_test[np.array(FeatureList)])
    prediction = np.empty(len(X_test[np.array(FeatureList)]), dtype = object)
    prediction = np.where(predict_prob[:,0] >= 0.5, 0,1)
    
    # Confustion matrix
    crossTab = pd.crosstab(prediction, y_test[ResultFeature])

    TP = crossTab.loc[0,0]  # True Positive, Female, Female
    TN = crossTab.loc[1,1]  # True Negative, Male, Male
    FP = crossTab.loc[1,0]  # False Positive, Male, Female
    FN = crossTab.loc[0,1]  # False Negative, Female, Male

    # Recall and Precision
    print(f"Recall : {TP / (TP+FN):.3f}")
    print(f"Precision : {TP / (TP+FP):.3f}")

    # Accuracy 
    print(f"Accuracy : {np.mean(prediction == y_test[ResultFeature]):.3f}\n")
    
    recall_list.append(TP / (TP+FN))
    precision_list.append(TP / (TP+FP))
    accuracy_list.append(np.mean(prediction == y_test[ResultFeature]))

print('-------------------------------')
print(f"Mean of Recall : {np.mean(recall_list):.3f}")
print(f"Mean of Precision : {np.mean(precision_list):.3f}")
print(f"Mean of Accuracy : {np.mean(accuracy_list):.3f}")



Recall : 0.878
Precision : 0.554
Accuracy : 0.837

Recall : 0.725
Precision : 0.707
Accuracy : 0.889

Recall : 0.867
Precision : 0.481
Accuracy : 0.846

Recall : 0.805
Precision : 0.660
Accuracy : 0.880

Recall : 0.810
Precision : 0.576
Accuracy : 0.841

-------------------------------
Mean of Recall : 0.817
Mean of Precision : 0.596
Mean of Accuracy : 0.859


In [58]:

# QDA (Quadratic Discriminant Analysis)
model = QuadraticDiscriminantAnalysis()

FeatureList = [
'Number words female',
'Number words male',
'Number of words lead',
'Difference in words lead and co-lead',
'Year',
'Number of female actors',
'Number of male actors',
'Age Lead',
'Age Co-Lead'
]
ResultFeature = 'Lead'

accuracy_list = []
recall_list = []
precision_list = []

# print(X_trainset[1][np.array(FeatureList)])
for X_train, X_test, y_train, y_test in zip(X_trainset, X_testset, y_trainset, y_testset):
    # fit model
    model.fit(X_train[np.array(FeatureList)], y_train[ResultFeature]) 
    
    # predict the test with model
    predict_prob = model.predict_proba(X_test[np.array(FeatureList)])
    prediction = np.empty(len(X_test[np.array(FeatureList)]), dtype = object)
    prediction = np.where(predict_prob[:,0] >= 0.5, 0,1)
    
    # Confustion matrix
    crossTab = pd.crosstab(prediction, y_test[ResultFeature])

    TP = crossTab.loc[0,0]  # True Positive, Female, Female
    TN = crossTab.loc[1,1]  # True Negative, Male, Male
    FP = crossTab.loc[1,0]  # False Positive, Male, Female
    FN = crossTab.loc[0,1]  # False Negative, Female, Male

    # Recall and Precision
    print(f"Recall : {TP / (TP+FN):.3f}")
    print(f"Precision : {TP / (TP+FP):.3f}")

    # Accuracy 
    print(f"Accuracy : {np.mean(prediction == y_test[ResultFeature]):.3f}\n")
    
    recall_list.append(TP / (TP+FN))
    precision_list.append(TP / (TP+FP))
    accuracy_list.append(np.mean(prediction == y_test[ResultFeature]))

print('-------------------------------')
print(f"Mean of Recall : {np.mean(recall_list):.3f}")
print(f"Mean of Precision : {np.mean(precision_list):.3f}")
print(f"Mean of Accuracy : {np.mean(accuracy_list):.3f}")



Recall : 0.852
Precision : 0.800
Accuracy : 0.894

Recall : 0.702
Precision : 0.805
Accuracy : 0.894

Recall : 0.826
Precision : 0.704
Accuracy : 0.885

Recall : 0.864
Precision : 0.760
Accuracy : 0.913

Recall : 0.782
Precision : 0.729
Accuracy : 0.865

-------------------------------
Mean of Recall : 0.805
Mean of Precision : 0.759
Mean of Accuracy : 0.890
