In [1]:
import pandas as pd
import numpy as np

#Plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# ML Libraries

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Boosting classifiers
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Save model
import pickle
import joblib

In [2]:
# fileLocation = 'C:\\Users\\VictorY\\Desktop\\TestData\\stage3FinalProcessedDF_Nov-17-2019.csv'
# moviesDataFileLocation = 'C:\\Yuva\\ITU\\4th Sem\\Thesis\\Data\\Final_Data_Movies_Directors.csv'
moviesDataFileLocation = 'C:\\Yuva\\ITU\\4th Sem\\Thesis\\Data\\Movies_director_data_3709_119Features.csv'
saveFileToPath = "C:\\Yuva\\ITU\\4th Sem\\Thesis\\Data\\"

movies_data = pd.read_csv(moviesDataFileLocation)

In [3]:
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3709 entries, 0 to 3708
Columns: 119 entries, director_ids to Gender
dtypes: float64(6), int64(107), object(6)
memory usage: 3.4+ MB


In [4]:
movies_data

Unnamed: 0,director_ids,director_names,movie_id,title,review_count_user,review_count_critic,metascore,rating_value,rating_count,release_date,...,title_subcategory_TV_short,title_subcategory_movie,title_subcategory_video,birthYear,director_profession_primary,director_profession_secondary,director_profession_tertiary,director_known_titles_count,director_known_titles_average_rating_value,Gender
0,nm0000485,Fritz Lang,tt0017136,Metropolis (1927) - IMDb,418,263,98,8.3,114917,5 March 1927 (Denmark) See more »,...,0,1,0,1890,1,0,0,8,7.93,Male
1,nm0000122,Charles Chaplin,tt0027977,Les temps modernes (1936) - IMDb,213,119,96,8.6,147990,24 September 1936 (France) See more »,...,0,1,0,1889,0,0,0,4,8.30,Male
2,nm0281808,Victor Fleming,tt0032138,"Óz, a csodák csodája (1939) - IMDb",540,213,100,8.1,299644,21 March 1940 (Hungary) See more »,...,0,1,0,1889,1,0,0,4,7.55,Male
3,nm0002030,George Cukor,tt0032138,"Óz, a csodák csodája (1939) - IMDb",540,213,100,8.1,299644,21 March 1940 (Hungary) See more »,...,0,1,0,1899,1,0,0,4,7.60,Male
4,nm0003506,James Mangold,tt0035423,Kate & Leopold (2001) - IMDb,317,124,44,6.4,65765,25 December 2001 (USA) See more »,...,0,1,0,1963,0,1,0,4,7.45,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3704,nm1503575,Barry Jenkins,tt4975722,Moonlight (2016) - IMDb,75,159,99,8.7,7304,18 November 2016 (USA) See more »,...,0,1,0,1979,1,0,0,3,7.67,Male
3705,nm2480587,Charles Ferguson,tt5001130,Time to Choose (2015) - IMDb,3,7,75,7.2,109,3 June 2016 (USA) See more »,...,0,1,0,1955,0,1,0,6,7.93,Male
3706,nm2207625,Kleber Mendonça Filho,tt5221584,Aquarius (2016) - IMDb,11,77,87,7.8,8101,28 September 2016 (France) See more »,...,0,1,0,1968,1,0,0,3,7.23,Male
3707,nm1347153,Tyler Perry,tt5325452,Boo! A Madea Halloween (2016) - IMDb,32,28,30,4.6,2421,21 October 2016 (USA) See more »,...,0,1,0,1969,1,0,0,8,5.75,Male


In [5]:
def balance_gender(dataframe):
    female_list_unique_directors = dataframe[(dataframe['Gender'] == 'Female')].director_ids.unique().tolist()
    
    unique_male_directors = dataframe[(dataframe['Gender'] == 'Male')].director_ids.unique().tolist()
    
    male_list_unique_directors = np.random.choice(unique_male_directors, replace = False, size = len(female_list_unique_directors)).tolist()
    
    total_list_unique_directors = female_list_unique_directors + male_list_unique_directors

    dataframe = dataframe[dataframe['director_ids'].isin(np.array(total_list_unique_directors).tolist())]

    return dataframe

In [6]:
# Generic function to equally max split the dataframe with respect to the column Name, in our case the Gender column.

def max_equal_split_on_columnName(df,columnName='Gender'):
    df1 = pd.DataFrame()
    for i in df[columnName].value_counts().index:
        df1 = df1.append(df[(df[columnName]==i)].sample(n=df[columnName].value_counts().values.min(),replace = False))        
    return df1.reset_index(drop=True)

In [7]:

# balanced_movies_data = max_equal_split_on_columnName(movies_data)

balanced_movies_data = balance_gender(movies_data)
balanced_movies_data.shape

(1263, 119)

In [8]:
# # Unique director distribution in max_equal DF
# print("Unique director distribution - Balanced data split based on Gender")

# balanced_movies_data.drop_duplicates(subset = 'director_ids', keep = 'first').Gender.value_counts()

In [9]:
# req_columns = ['review_count_user', 'review_count_critic','rating_value',
#                'rating_count','movie_year','birthYear','Gender']

In [10]:
numericalGender = {"Male": 1, "Female": 0 }

movies_data.Gender.replace(numericalGender,inplace=True)
balanced_movies_data.Gender.replace(numericalGender,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [11]:
object_features_to_remove = movies_data.select_dtypes(include = 'object').columns.values
object_features_to_remove

array(['director_ids', 'director_names', 'movie_id', 'title',
       'release_date'], dtype=object)

In [12]:
# movies_data = movies_data[req_columns]
movies_data = movies_data[movies_data.columns.difference(object_features_to_remove)]
print('The shape of our Unbalanced Movies dataset is:', movies_data.shape)

The shape of our Unbalanced Movies dataset is: (3709, 114)


In [13]:
# balanced_movies_data = balanced_movies_data[req_columns]
balanced_movies_data = balanced_movies_data[balanced_movies_data.columns.difference(object_features_to_remove)]
print('The shape of our Balanced Movies dataset is:', balanced_movies_data.shape)

The shape of our Balanced Movies dataset is: (1263, 114)


In [14]:
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3709 entries, 0 to 3708
Columns: 114 entries, Gender to title_subcategory_video
dtypes: float64(6), int64(108)
memory usage: 3.2 MB


In [15]:
movies_data.Gender.value_counts()

1    3124
0     585
Name: Gender, dtype: int64

In [16]:
balanced_movies_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1263 entries, 5 to 3708
Columns: 114 entries, Gender to title_subcategory_video
dtypes: float64(6), int64(108)
memory usage: 1.1 MB


In [17]:
balanced_movies_data.Gender.value_counts()

1    678
0    585
Name: Gender, dtype: int64

## Divide the data to training and testing - Scaling methods too

Check about the Scaling

In [18]:
def divide_test_train_with_scaling_methods(df,predict='Gender',scalingMethod = "MinMaxScaler"):
    
    
    # Feature matrix and target variable
    X = df.drop(predict,axis = 'columns')
    y = df[predict]
    
    # Scaling
    if (scalingMethod == "MinMaxScaler"):
        print("The scaling method used is : ",scalingMethod)
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X)
        # X_test = scaler.transform(X_test)
        
    else:
        print("No Scaling Method used or incorrect input")
        X_train = X
    
    # Divide the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.20, random_state = 42 )
    
    return X_train, X_test, y_train, y_test

In [19]:
# Divide the data into training and test

# X_train, X_test, y_train, y_test = divide_test_train_with_scaling_methods(movies_data,scalingMethod="None")
X_train, X_test, y_train, y_test = divide_test_train_with_scaling_methods(movies_data,scalingMethod="MinMaxScaler")

The scaling method used is :  MinMaxScaler


In [20]:
# # Divide the data into training and test

# X_train, X_test, y_train, y_test = divide_test_train_with_scaling_methods(balanced_movies_data,scalingMethod="None")
# X_train, X_test, y_train, y_test = divide_test_train_with_scaling_methods(balanced_movies_data,scalingMethod="MinMaxScaler")

# Boosting Classifiers

### 1.  Gradient Boosting Classifier

#### 1a) Identify the optimal Learning rate

In [21]:
# # Script to find the maximum learning_rate where the test_accuracy is maximum

#     optimal_learning_rate = learningRateDF['learning_rate'].loc[learningRateDF.
#                                                                 mask((learningRateDF == learningRateDF.min())
#                                                                      .cumsum()
#                                                                      .astype(bool))
#                                                                 [::-1]
#                                                                 .idxmax()
#                                                                 ['testing_accuracy']]

In [22]:
def optimal_learning_rate(X_train, X_test, y_train, y_test):
    
    lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

    col_learningDF = ['learning_rate','training_accuracy','testing_accuracy']
    learningRateDF = pd.DataFrame(columns=col_learningDF)

    for learning_rate in lr_list:

        gb_clf = GradientBoostingClassifier(n_estimators=100,
                                            learning_rate=learning_rate,
                                            max_leaf_nodes= 32,
                                            max_features=2,
                                            max_depth=2,
                                            random_state= 42 )
        gb_clf.fit(X_train, y_train)

        accuracy_train_score = gb_clf.score(X_train, y_train)
        accuracy_test_score = gb_clf.score(X_test, y_test)

        learningRateDF.loc[len(learningRateDF)] = [learning_rate,accuracy_train_score,accuracy_test_score]
#         print("Learning rate: ", learning_rate)
#         print("Accuracy score (training): {0:.3f}".format(accuracy_train_score))
#         print("Accuracy score (test): {0:.3f}".format(accuracy_test_score))
    print(learningRateDF)

#     optimal_learning_rate = learningRateDF['learning_rate'].loc[learningRateDF.testing_accuracy.argmax()]

    optimal_learning_rate = learningRateDF['learning_rate'].loc[((learningRateDF == learningRateDF.min())
                                                                 .cumsum()
                                                                 .astype(bool))
                                                                [::-1].idxmax()
                                                                ['testing_accuracy']
                                                               ]

    return optimal_learning_rate

In [23]:
chosen_learning_rate_GB = optimal_learning_rate(X_train, X_test, y_train, y_test)
print("\nThe optimal learning rate for XGB Classifier is :",chosen_learning_rate_GB)

   learning_rate  training_accuracy  testing_accuracy
0          0.050           0.840917          0.847709
1          0.075           0.840917          0.847709
2          0.100           0.841591          0.847709
3          0.250           0.846983          0.843666
4          0.500           0.854398          0.846361
5          0.750           0.855747          0.836927
6          1.000           0.856084          0.831536

The optimal learning rate for XGB Classifier is : 1.0


#### 1b) Model Function

In [24]:
def train_model_Gradient_Boosting_classifier(X_train, X_test, y_train, y_test,learning_rate = 1):
    
    # Classifier parameters    
    gb_clf2 = GradientBoostingClassifier(n_estimators=100,
                                         learning_rate=learning_rate,
                                         max_features=2,
                                         max_depth=2,
                                         random_state= 42
                                        )
    gb_clf2.fit(X_train, y_train)
    predictions = gb_clf2.predict(X_test)
    
    gb_train_score = gb_clf2.score(X_train, y_train)
    print("Training Score of Gradient Boosting Classifier: ",round(gb_train_score,3))
    
    gb_test_score = gb_clf2.score(X_test, y_test)
    print("Testing Score of Gradient Boosting Classifier : ",round(gb_test_score,3))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    print("\nClassification Report")
    print(classification_report(y_test, predictions))
       
    return gb_clf2

#### 1c) Run Results of Gradient Boosting Model

In [25]:
train_model_Gradient_Boosting_classifier(X_train, X_test, y_train, y_test,learning_rate = chosen_learning_rate_GB)
# tn, fp, fn, tp

Training Score of Gradient Boosting Classifier:  0.856
Testing Score of Gradient Boosting Classifier :  0.832

Confusion Matrix:
[[  7 106]
 [ 19 610]]

Classification Report
              precision    recall  f1-score   support

           0       0.27      0.06      0.10       113
           1       0.85      0.97      0.91       629

    accuracy                           0.83       742
   macro avg       0.56      0.52      0.50       742
weighted avg       0.76      0.83      0.78       742



GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=1.0, loss='deviance', max_depth=2,
                           max_features=2, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

### 2. XGB Classifier

#### 2b) Optimal learning rate

In [26]:
def optimal_learning_rate_xgb(X_train, X_test, y_train, y_test):
    
    lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

    col_learningDF = ['learning_rate','training_accuracy','testing_accuracy']
    learningRateDF = pd.DataFrame(columns=col_learningDF)

    for learning_rate in lr_list:

        xgb_clf = XGBClassifier(n_estimators=100,
                                         learning_rate=learning_rate,
                                         max_features=2,
                                         max_depth=2,
                                         random_state= 42)
        
        xgb_clf.fit(X_train, y_train)

        accuracy_train_score = xgb_clf.score(X_train, y_train)
        accuracy_test_score = xgb_clf.score(X_test, y_test)

        learningRateDF.loc[len(learningRateDF)] = [learning_rate,accuracy_train_score,accuracy_test_score]
#         print("Learning rate: ", learning_rate)
#         print("Accuracy score (training): {0:.3f}".format(accuracy_train_score))
#         print("Accuracy score (test): {0:.3f}".format(accuracy_test_score))
    print(learningRateDF)

#     optimal_learning_rate = learningRateDF['learning_rate'].loc[learningRateDF.testing_accuracy.argmax()]

# Script to find the maximum learning_rate where the test_accuracy is maximum

    optimal_learning_rate = learningRateDF['learning_rate'].loc[learningRateDF.
                                                                mask((learningRateDF == learningRateDF.min())
                                                                     .cumsum()
                                                                     .astype(bool))
                                                                [::-1]
                                                                .idxmax()
                                                                ['testing_accuracy']]

    return optimal_learning_rate

In [27]:
chosen_learning_rate_XGB = optimal_learning_rate_xgb(X_train, X_test, y_train, y_test)
print("\nThe optimal learning rate for XGB Classifier is :",chosen_learning_rate_XGB)

   learning_rate  training_accuracy  testing_accuracy
0          0.050           0.840917          0.847709
1          0.075           0.841591          0.847709
2          0.100           0.842265          0.849057
3          0.250           0.851702          0.842318
4          0.500           0.877654          0.842318
5          0.750           0.899562          0.831536
6          1.000           0.915403          0.818059

The optimal learning rate for XGB Classifier is : 0.1


#### 2b) Model Function

In [28]:
def train_model_XGB_classifier(X_train, X_test, y_train, y_test,learning_rate = 1):
    
    # Classifier parameters    

    xgb_clf = XGBClassifier(n_estimators=100,
                                         learning_rate=learning_rate,
                                         max_features=2,
                                         max_depth=2,
                                         random_state= 42)
    xgb_clf.fit(X_train, y_train)
    
    xgb_train_score = xgb_clf.score(X_train, y_train)
    print("Training Score : ",round(xgb_train_score,3))
    
    xgb_test_score = xgb_clf.score(X_test, y_test)
    print("Testing Score : ",round(xgb_test_score,3))
    
    predictions = xgb_clf.predict(X_test)

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    print("Classification Report")
    print(classification_report(y_test, predictions))
    
    return xgb_clf

###  2c) Run results of XGB Model

In [29]:
train_model_XGB_classifier(X_train, X_test, y_train, y_test,learning_rate = chosen_learning_rate_XGB)
# tn, fp, fn, tp

Training Score :  0.842
Testing Score :  0.849
Confusion Matrix:
[[  1 112]
 [  0 629]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.01      0.02       113
           1       0.85      1.00      0.92       629

    accuracy                           0.85       742
   macro avg       0.92      0.50      0.47       742
weighted avg       0.87      0.85      0.78       742



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=2, max_features=2,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

# End