## Imports

In [100]:
import ast
import time
import pickle
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Global variables

In [101]:
scaler = MinMaxScaler()

# One hot encoding
genres_mlb = MultiLabelBinarizer()
spoken_languages_mlb = MultiLabelBinarizer()
production_countries_mlb = MultiLabelBinarizer()

# Label encoding
original_language_encoder = LabelEncoder()
status_encoder = LabelEncoder()
name_encoder = LabelEncoder()
character_encoder = LabelEncoder()

# Hashing encoding
keywords_hash_num = 30    #Change this Number for more accuracy
keywords_hash_columns = []
hash_column_name = ''

for i in range(keywords_hash_num):
  hash_column_name = 'keywords_hash_' + str(i)
  keywords_hash_columns.append(hash_column_name)

keywords_hasher = FeatureHasher(n_features=keywords_hash_num, input_type='string')


production_companies_hash_num = 20 #Change this Number for more accuracy
production_companies_hash_columns = []
hash_column_name = ''

for i in range(production_companies_hash_num):
  hash_column_name = 'production_companies_hash_' + str(i)
  production_companies_hash_columns.append(hash_column_name)

production_companies_hasher = FeatureHasher(n_features=production_companies_hash_num, input_type='string')


tfidf  = TfidfVectorizer(stop_words='english')


cast_num = 1  
cast_columns = []
cast_character = ''
cast_gender = ''
cast_name = ''
cast_order = ''

for i in range(cast_num):
  cast_character = 'cast_' + str(i) + '_character'
  cast_gender = 'cast_' + str(i) + '_gender'
  cast_name = 'cast_' + str(i) + '_name'
  cast_order = 'cast_' + str(i) + '_order'

  cast_columns.append(cast_character)
  cast_columns.append(cast_gender)
  cast_columns.append(cast_name)
  cast_columns.append(cast_order)

rate_encoder = LabelEncoder()


missingValues = {
    'budget'  :  None,
    'genres'  :  None,
    'homepage'  :  None,
    'id'  :  None,
    'keywords'  :  None,
    'original_language'  :  None,
    'original_title'  :  None,
    'overview'  :  None,
    'viewercount'  :  None,
    'production_companies'  :  None,
    'production_countries'  :  None,
    'release_date'  :  None,
    'revenue'  :  None,
    'runtime'  :  None,
    'spoken_languages'  :  None,
    'status'  :  None,
    'tagline'  :  None,
    'title'  :  None,
    'vote_count'  :  None,
    'cast'  :  None,
    'crew'  :  None,
}


## Helper functions

### Load the models

In [102]:

# Load the encoding objects
def load_encoders():
    with open('encoders.pkl', 'rb') as file:
        encodings = pickle.load(file)

        
    genres_mlb = encodings['genres_mlb']
    spoken_languages_mlb = encodings['spoken_languages_mlb']
    production_countries_mlb = encodings['production_countries_mlb']
    original_language_encoder = encodings['original_language_encoder']
    status_encoder = encodings['status_encoder']
    name_encoder = encodings['name_encoder']
    character_encoder = encodings['character_encoder']
    rate_encoder = encodings['rate_encoder']
    
    return genres_mlb, spoken_languages_mlb, production_countries_mlb, original_language_encoder, status_encoder, name_encoder, character_encoder, rate_encoder

# Load the feature hashers
def load_hashers():
    with open('hashers.pkl', 'rb') as file:
        hashing = pickle.load(file)

    keywords_hasher = hashing['keywords_hasher']
    production_companies_hasher = hashing['production_companies_hasher']

    return keywords_hasher, production_companies_hasher
    

# Load the TF-IDF vectorizer
def load_tfidf_vectorizer():
    with open('tfidf_vectorizer.pkl', 'rb') as file:
        return pickle.load(file)


# Load the scalers
def load_scalers():
    with open('scalers.pkl', 'rb') as file:
        return pickle.load(file)

# Load the feature selection
def load_feature_selection():
    with open('feature_selection.pkl', 'rb') as file:
        return pickle.load(file)

# Load the models
def load_model(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)
    
# Load the missing values
def load_missing_values():
    with open('missing_values.pkl', 'rb') as file:
        return pickle.load(file)



### Adding a new label in label encoding

In [103]:
def unseen(data, encoder, columnName):
    unseen_values = list(set(data[columnName]) - set(encoder.classes_))
    if unseen_values:
        for unseen in unseen_values:
            if unseen not in encoder.classes_:
                new_label = max(encoder.transform(encoder.classes_)) + 1
                encoder.classes_ = np.append(encoder.classes_, unseen)
                encoder.transform([unseen])[0] = new_label



### Converting list of dictionaries to normal list of elements


In [104]:
def transformColumn(enteredData, columnName, dictionaryKey):
    enteredData[columnName] = [ast.literal_eval(row) for row in enteredData[columnName]]
    for index, row in enteredData[columnName].items():
        finalList = []
        for j in range(len(row)):
            finalList.append(row[j][dictionaryKey])
        enteredData.at[index, columnName] = finalList


In [105]:
def transformMoviesColumns(enteredData, columnName, dictionaryKey1, dictionaryKey2, dictionaryKey3, dictionaryKey4):
  enteredData[columnName] = [ast.literal_eval(row) for row in enteredData[columnName]]
  for index, row in enteredData[columnName].items():
    finalList = []
    for j in range(len(row)):
      fList = []
      fList.append(row[j][dictionaryKey1])
      fList.append(row[j][dictionaryKey2])
      fList.append(row[j][dictionaryKey3])
      fList.append(row[j][dictionaryKey4])
      finalList.append(fList)
    enteredData.at[index, columnName] = finalList


### Filling missing values

In [106]:

def fillMissingTestData(data):
    missingValues = load_missing_values()
    
    categoralColumns = ['genres', 'keywords', 'spoken_languages',
                        'production_companies', 'production_countries', 'cast', 'crew']
    for i in categoralColumns:
        data[i] = data[i].apply(lambda x: x if x else missingValues[i])
   
    numericalColumns = ['budget', 'id', 'viewercount',
                    'release_date', 'revenue', 'runtime', 'vote_count']
    for i in numericalColumns:
        data[i] = data[i].replace(0, missingValues[i])
        data[i] =  data[i].fillna(missingValues[i])

    textualColumns = ['homepage', 'original_title', 'tagline',
                      'title', 'status', 'overview', 'original_language']
    for i in textualColumns:
        data[i] =  data[i].fillna(missingValues[i])

    return data
    


### Add the cast and crew columns to X dataFrame

In [107]:
def join_columns(data, movies):
  # Add two empty columns to the DataFrame
  data = data.join(pd.DataFrame(movies['cast'], columns=['cast'], index = data.index))
  data = data.join(pd.DataFrame(movies['crew'], columns=['crew'], index = data.index))

  # Put the cast and crew in the right cells
  for dataIndex, dataRow in data.iterrows():
    if dataRow['id'] in movies['movie_id'].values:
      index = movies.loc[movies['movie_id'] == dataRow['id']].index[0]
      data.at[dataIndex, 'cast'] = movies.at[index, 'cast']
      data.at[dataIndex, 'crew'] = movies.at[index, 'crew']
    else:
      data.at[dataIndex, 'cast'] = '[]'
      data.at[dataIndex, 'crew'] = '[]'
  
  return data


## Preprocessing 

In [108]:

def nonModels_preprocessing_script(data):
    # converting list of dectionaries to normal lists of elements using "transformColumn" function
    # genres column
    transformColumn(data, 'genres', 'name')
    # keywords column
    transformColumn(data, 'keywords', 'name')
    # production_companies column
    transformColumn(data, 'production_companies', 'name')
    # production_countries column
    transformColumn(data, 'production_countries', 'name')
    # spoken_languages column
    transformColumn(data, 'spoken_languages', 'iso_639_1')
    # cast column
    transformMoviesColumns(data, 'cast', 'character', 'gender', 'name', 'order')
    # crew column
    transformMoviesColumns(data, 'crew', 'name', 'department', 'gender', 'job')

    # Only leaving the year from the release date column
    data['release_date'] = data['release_date'].str[-4:].astype(int)

    data = fillMissingTestData(data)

    # Placing the values from the cast column to a column of it's own  
    data = data.join(pd.DataFrame(columns=cast_columns, index = data.index))
    for index, row in data.iterrows():
      k = 0
      for j in range(cast_num):
        if(j > len(row['cast']) - 1):
          break 
        data.at[index, cast_columns[k]] = row['cast'][j][0]
        k += 1
        data.at[index, cast_columns[k]] = row['cast'][j][1]
        k += 1
        data.at[index, cast_columns[k]] = row['cast'][j][2]
        k += 1
        data.at[index, cast_columns[k]] = row['cast'][j][3]
        k += 1

    data.drop('cast',axis=1,inplace=True)
    
    
    return data


### Preprocessing test Script

In [109]:

# Apply the loaded encodings to the test data
def apply_encodings(data, y):
    # Load the encoding objects, hashers, vectorizer, and scalers
    genres_mlb, spoken_languages_mlb, production_countries_mlb, original_language_encoder, status_encoder, name_encoder, character_encoder, rate_encoder =  load_encoders()
    keywords_hasher, production_companies_hasher =  load_hashers()
    tfidf = load_tfidf_vectorizer()
    scaler = load_scalers()
    
    # Apply one-hot encoding
    # genres column
    data = data.join(pd.DataFrame(genres_mlb.transform(data.pop('genres')),
                                    columns=genres_mlb.classes_,
                                    index=data.index))
    
    # spoken_languages column
    data = data.join(pd.DataFrame(spoken_languages_mlb.transform(data.pop('spoken_languages')),
                                    columns=spoken_languages_mlb.classes_,
                                    index=data.index))
    # production_countries column 
    data = data.join(pd.DataFrame(production_countries_mlb.transform(data.pop('production_countries')),
                                    columns=production_countries_mlb.classes_,
                                    index=data.index))
    

    # Apply lable encoding
    # original_language column
    unseen(data, original_language_encoder, 'original_language')
    data['original_language'] = original_language_encoder.transform(data['original_language'])
    # status column
    unseen(data, status_encoder, 'status')
    data['status'] = status_encoder.transform(data['status'])
    # cast column
    for i in range(cast_num):
        cast_name = 'cast_' + str(i) + '_name'
        
        unseen(data, name_encoder, cast_name)
        data[cast_name] = name_encoder.transform(data[cast_name])


        cast_character = 'cast_' + str(i) + '_character'

        unseen(data, character_encoder, cast_character)
        data[cast_character] = character_encoder.transform(data[cast_character])
            
    # Apply hashing encoding
    # keywords column
    data = data.join(pd.DataFrame((keywords_hasher.transform(data.pop('keywords')).toarray()), columns=keywords_hash_columns, index=data.index))
    # production_companies column
    data = data.join(pd.DataFrame((production_companies_hasher.transform(data.pop('production_companies')).toarray()), columns=production_companies_hash_columns, index=data.index))


    # Apply TF-IDF 
    overview_vectors  = tfidf.transform(data['overview'])        
    data['overview'] = list(overview_vectors.toarray())
    data['overview'] = data['overview'].apply(lambda x: sum(x) / len(x))


    # Normalizing the numerical columns
    num_cols = data.select_dtypes(include=['int', 'float']).columns.tolist()
    num_cols.append("vote_count")
    # print(len(num_cols))

    # for column in num_cols:
    #     print(f"Column '{column}': {data[column].dtype}")
    

    data[num_cols] = scaler.transform(data[num_cols])


        
    data.drop('homepage',axis=1,inplace=True)
    data.drop('id',axis=1,inplace=True)
    data.drop('original_title',axis=1,inplace=True)
    data.drop('tagline',axis=1,inplace=True)
    data.drop('title',axis=1,inplace=True)
    data.drop('crew',axis=1,inplace=True)

    y = rate_encoder.transform(y)

    return data, y



## Test script

In [110]:
    
# Test script
def test_script(data, movies):

    data = pd.read_csv(data)
    X = data.iloc[:, :19] 
    Y = data['Rate']
    movies = pd.read_csv(movies)

    X = join_columns(X, movies)

    X = nonModels_preprocessing_script(X)
    X, Y = apply_encodings(X, Y)

    rfe = load_feature_selection()
    X = rfe.transform(X)

    best_dt = load_model('best_dt_model.pkl')
    y_pred = best_dt.predict(X)
    best_dt_accuracy = accuracy_score(Y, y_pred)
    print(f"Decision Tree Classifier accuracy: {best_dt_accuracy:.2f}%")

    best_rf  = load_model('best_rf_model.pkl')
    y_pred = best_rf.predict(X)
    best_rf_accuracy = accuracy_score(Y, y_pred)
    print(f"Random Forest Classifier accuracy: {best_rf_accuracy:.2f}%")

    best_lr  = load_model('best_lr_model.pkl')
    y_pred = best_lr.predict(X)
    best_lr_accuracy = accuracy_score(Y, y_pred)
    print(f"Logistic Regression Classifier accuracy: {best_lr_accuracy:.2f}%")


    
        

# Test the model using a new CSV file
test_script('movies-tas-test.csv', 'credit-tas-test.csv')



  .format(sorted(unknown, key=str)))
  .format(sorted(unknown, key=str)))


Decision Tree Classifier accuracy: 0.28%
Random Forest Classifier accuracy: 0.53%
Logistic Regression Classifier accuracy: 0.27%
