In [1]:
# general use
import os
import csv
import pickle
import scipy as sp
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from sklearn.model_selection import train_test_split

# save numpy array as csv file
from numpy import asarray
from numpy import savetxt

# for evaluation
from numpy import mean, std
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score

# for current method
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Locations of the two CSV files, relative to the notebook's working directory.
data_dirpath = 'dataset'
train_name = 'train.csv'
test_name = 'test.csv'

train_path = os.path.join(data_dirpath, train_name)
test_path = os.path.join(data_dirpath, test_name)
train_df = pd.read_csv(train_path, header=[0])
test_df = pd.read_csv(test_path, header=[0])

print(f'[Default] Number of train data: {train_df.shape[0]}, Number of test data: {test_df.shape[0]}')

# on train set
# lead_map = {'Female': 0, 'Male': 1}
# train_df['Lead'] = train_df['Lead'].map(lead_map).astype(int)

# on test set
# The cells below operate on the name `train_df`; pointing it at the test data
# runs the same pipeline on the test file instead of the training file.
# Take a copy so that the in-place column edits made further down do not also
# modify `test_df` through a shared reference.
train_df = test_df.copy()


[Default] Number of train data: 1039, Number of test data: 387


In [3]:
# Feature engineering on `train_df` (modified in place).
# The tags G1 / C2 / W1 / W2 are the labels the experiment log uses for each step.
# Intermediate quantities live in local Series instead of throwaway columns;
# the columns left on the frame, and their order, are unchanged.

# Actor counts feed G1 and W1; the raw count columns are dropped at the end.
male_actor_count = train_df['Number of male actors']
female_actor_count = train_df['Number of female actors']

# G1: log of the male/female actor ratio, with +1 added to both counts.
actor_ratio = (male_actor_count + 1) / (female_actor_count + 1)
train_df["Log Male/Female Actors Ratio"] = np.log(actor_ratio)

# C2: total words per gender.
# NOTE(review): when 'Lead' is present the lead's words are folded into the
# total for the lead's gender; when it is absent the totals are just the raw
# per-gender word counts. The two branches therefore define these columns
# differently -- confirm this matches how the saved model was trained.
if 'Lead' in train_df.columns:
    # train set
    lead_words = train_df['Number of words lead']
    all_words = train_df['Total words']
    male_words = train_df['Number words male']
    female_words = train_df['Number words female']

    train_df.loc[train_df['Lead'] == 'Male', 'Total Male Words'] = male_words + lead_words
    train_df.loc[train_df['Lead'] != 'Male', 'Total Male Words'] = all_words - (female_words + lead_words)

    train_df.loc[train_df['Lead'] == 'Female', 'Total Female Words'] = female_words + lead_words
    train_df.loc[train_df['Lead'] != 'Female', 'Total Female Words'] = all_words - (male_words + lead_words)
else:
    # on test set
    train_df['Total Male Words'] = train_df['Number words male']
    train_df['Total Female Words'] = train_df['Number words female']

train_df.drop(columns=['Number words female', 'Number words male'], inplace=True)

# W1: words per actor of each gender.
# A zero actor count makes this division non-finite; a later cell replaces
# infinite values in the feature frame.
words_per_male_actor = train_df['Total Male Words'] / male_actor_count
words_per_female_actor = train_df['Total Female Words'] / female_actor_count

# W2: only the log of W1 is kept as a feature.
train_df['Log Words per Male Actor'] = np.log(words_per_male_actor)
train_df['Log Words per Female Actor'] = np.log(words_per_female_actor)

# The raw actor counts are not used as features.
train_df.drop(columns=['Number of male actors', 'Number of female actors'], inplace=True)


# Disabled experiments, kept for reference:
# train_df["LogGross"] = np.log(train_df["Gross"])
# train_df["Total Male/Female Words Ratio"] = train_df["Total Male Words"] / train_df["Total Female Words"]
# train_df['Log Total Male Words'] = np.log(train_df['Total Male Words'])
# train_df['Log Total Female Words'] = np.log(train_df['Total Female Words'])
# train_df["Log Total Male/Female Words Ratio"] = np.log(train_df["Total Male/Female Words Ratio"])
# train_df['Lead Word Dominance'] = train_df['Number of words lead'] / train_df['Total words']
# train_df["Total Male Words Percentage"] = train_df["Total Male Words"]/train_df["Total words"]
# train_df["Total Female Words Percentage"] = train_df["Total Female Words"]/train_df["Total words"]
# train_df["Total Female Words Percentage"] = 1 - train_df["Total Male Words Percentage"]



  result = getattr(ufunc, method)(*inputs, **kwargs)


In [4]:
# Split the engineered frame into model inputs (x_data) and labels (y_data).
# y_data stays None when there is no 'Lead' column; the evaluation code checks
# for None before computing metrics.
#
# x_data is an explicit copy: later cells add and drop columns on it in place,
# which on an alias or slice of train_df either raises SettingWithCopyWarning
# or silently modifies train_df as well.
x_data = train_df.copy()
y_data = None

if 'Lead' in x_data.columns:
    # pop() removes the label column from the features and returns it.
    y_data = x_data.pop('Lead')

In [5]:
################################################
#                Test features                 #
################################################
# Each step carries the tag used in the experiment log further down.

# A: product of release year and gross.
x_data["YearXGross"] = x_data["Year"] * x_data["Gross"]

# TW1: words spoken by everyone except the lead.
x_data["Other total words"] = x_data["Total words"] - x_data['Number of words lead']

# E1 / E2 / E4: raw columns removed from the feature set, in a single in-place drop.
# (E3 -- also dropping 'Number of male actors' / 'Number of female actors' --
#  was noted as a decrease and is not applied here.)
x_data.drop(
    columns=[
        'Mean Age Male', 'Mean Age Female',      # E1
        'Total words', 'Number of words lead',   # E2
        'Age Lead', 'Age Co-Lead',               # E4
    ],
    inplace=True,
)

##################################################
#                Test Combinations
##################################################
# A+G1+C2+W1+W2+TW1+E1+E2+E4
# depth=3 / learning_rate=0.2 / n_estimators=700
# Mean Accuracy: 0.913
# Mean Recall: 0.944
# Mean Precision: 0.940
# Mean F1: 0.941

# A+G1+C2+W2+TW1+E1+E2+E4 => W2 is better
# depth=3 / learning_rate=0.2 / n_estimators=700
# Mean Accuracy: 0.914
# Mean Recall: 0.944
# Mean Precision: 0.946
# Mean F1: 0.942

##################################################
##################################################

# Column names in the order the features are passed to the model.
feature_names = list(x_data.columns)
x_data.head()

Unnamed: 0,Difference in words lead and co-lead,Year,Gross,Log Male/Female Actors Ratio,Total Male Words,Total Female Words,Log Words per Male Actor,Log Words per Female Actor,YearXGross,Other total words
0,2241,2005,46.0,0.81093,2155,669,5.596104,5.407172,92230.0,2824.0
1,1186,2001,278.0,1.098612,1960,161,5.971262,5.081404,556278.0,2121.0
2,2231,2008,53.0,1.15268,10174,1720,6.337219,5.840642,106424.0,11894.0
3,9912,2001,81.0,0.693147,10384,3342,6.683072,6.322565,162081.0,13726.0
4,2858,1992,131.0,1.252763,8778,1520,7.288244,7.326466,260952.0,10298.0


In [6]:
# Per-column count of missing values.
print(f'x_data.isnull = \n{x_data.isnull().sum()}')


# Count infinite entries, overwrite them in place, then count again to confirm.
# Both +inf and -inf are replaced by the same constant, 9.
count = np.isinf(x_data).to_numpy().sum()
print(f"\nBefore It contains {count} infinite values")
x_data.replace([np.inf, -np.inf], 9, inplace=True)
count = np.isinf(x_data).to_numpy().sum()
print(f"\nAfter It contains {count} infinite values")

x_data.isnull = 
Difference in words lead and co-lead    0
Year                                    0
Gross                                   0
Log Male/Female Actors Ratio            0
Total Male Words                        0
Total Female Words                      0
Log Words per Male Actor                0
Log Words per Female Actor              0
YearXGross                              0
Other total words                       0
dtype: int64

Before It contains 16 infinite values

After It contains 0 infinite values


In [7]:
# Load the saved classifier and predict on the prepared feature frame.
# SECURITY: pickle.load can execute arbitrary code embedded in the file --
# only load model files that you created yourself or otherwise trust.
filename = 'finalized_model2.sav'
with open(filename, 'rb') as model_file:  # context manager closes the handle
    model = pickle.load(model_file)
y_pred = model.predict(x_data)

# Metrics can only be computed when labels are available.
# NOTE(review): y_data is taken straight from the 'Lead' column. If it holds
# strings while the model predicts 0/1, these metric calls will not line up --
# confirm the labels use the same encoding as the model's output.
if y_data is not None:
    accuracy = accuracy_score(y_data, y_pred)
    recall = recall_score(y_data, y_pred)
    precision = precision_score(y_data, y_pred)
    f1 = f1_score(y_data, y_pred)
    cm = confusion_matrix(y_data, y_pred)
    print(f'accuracy={accuracy}, recall={recall}, precision={precision}, f1={f1}, cm={cm}')

In [8]:
# Encode the predictions for the output file: a prediction equal to 1 is
# written as '0', anything else as '1'.
# NOTE(review): this inverts the model's 0/1 values -- presumably the expected
# output format uses the opposite encoding from the model; confirm.
#
# y_pred is rebound to the encoded strings, so without this guard a second run
# of the cell would see only strings (none equal to 1) and write all '1'.
already_encoded = all(isinstance(p, str) and p in ('0', '1') for p in y_pred)
if already_encoded:
    y_pred = list(y_pred)
else:
    y_pred = ['0' if x == 1 else '1' for x in y_pred]

# One comma-separated line, no header and no trailing newline.
pred_content = ','.join(y_pred)
with open('predictions.csv', 'w') as file:
    file.write(pred_content)

In [9]:
# Show the encoded predictions (the '0'/'1' strings written to predictions.csv).
print(y_pred)


['0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '1', '1', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '0', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '1', '1', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '1', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',