# Machine Learning Model: Logistic Regression
    

In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import sklearn as skl
import numpy as np
from pathlib import Path
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Importing the data

In [2]:
# Import the cleaned, combined CSV file and read it into a Pandas DataFrame
data = Path('./Data Files/machine_learning_titles.csv')
titles_df = pd.read_csv(data)
titles_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,IMDb,Genre,Growth Outcome
0,3%,TV Show,Brazil,2020-08-14,2020,TV-MA,7.4,Drama,1
1,Most Beautiful Thing,TV Show,Brazil,2020-06-19,2020,TV-MA,7.9,Drama,1
2,Omniscient,TV Show,Brazil,2020-01-29,2020,TV-MA,6.3,Drama,1
3,Spectros,TV Show,Brazil,2020-02-20,2020,TV-MA,4.4,Horror,0
4,Rich in Love,Movie,Brazil,2020-04-30,2020,TV-14,6.0,Comedy,1
...,...,...,...,...,...,...,...,...,...
3886,Manoranjan,Movie,India,2017-09-01,1974,TV-14,6.8,Comedy,1
3887,Immoral Tales,Movie,France,2019-06-06,1974,UR,5.6,Drama,1
3888,Monty Python's Flying Circus,TV Show,United Kingdom,2018-10-02,1974,TV-14,8.8,Mystery,1
3889,Professor,Movie,India,2017-07-01,1962,TV-PG,7.0,Comedy,1


## Encode categorical values using a OneHotEncoder

In [3]:
# Drop the titles
titles_df=titles_df.drop(['Title','date_added'],axis='columns')
titles_df

Unnamed: 0,type,country,release_year,rating,IMDb,Genre,Growth Outcome
0,TV Show,Brazil,2020,TV-MA,7.4,Drama,1
1,TV Show,Brazil,2020,TV-MA,7.9,Drama,1
2,TV Show,Brazil,2020,TV-MA,6.3,Drama,1
3,TV Show,Brazil,2020,TV-MA,4.4,Horror,0
4,Movie,Brazil,2020,TV-14,6.0,Comedy,1
...,...,...,...,...,...,...,...
3886,Movie,India,1974,TV-14,6.8,Comedy,1
3887,Movie,France,1974,UR,5.6,Drama,1
3888,TV Show,United Kingdom,1974,TV-14,8.8,Mystery,1
3889,Movie,India,1962,TV-PG,7.0,Comedy,1


In [4]:
# Generate our categorical variable list
titles_cat = titles_df.dtypes[titles_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
titles_df[titles_cat].nunique()

type        2
country    71
rating     14
Genre      15
dtype: int64

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(titles_df[titles_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(titles_cat)
encode_df

Unnamed: 0,type_Movie,type_TV Show,country_Argentina,country_Australia,country_Austria,country_Bangladesh,country_Belgium,country_Brazil,country_Bulgaria,country_Cambodia,...,Genre_Documentaries,Genre_Drama,Genre_Horror,Genre_Musicals,Genre_Mystery,Genre_Reality,Genre_Romance,Genre_Sci-Fi & Fantasy,Genre_Teen,Genre_Thrillers
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3886,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3887,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3888,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3889,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals
titles_df = titles_df.merge(encode_df,left_index=True, right_index=True)
titles_df = titles_df.drop(titles_cat,1)
titles_df

Unnamed: 0,release_year,IMDb,Growth Outcome,type_Movie,type_TV Show,country_Argentina,country_Australia,country_Austria,country_Bangladesh,country_Belgium,...,Genre_Documentaries,Genre_Drama,Genre_Horror,Genre_Musicals,Genre_Mystery,Genre_Reality,Genre_Romance,Genre_Sci-Fi & Fantasy,Genre_Teen,Genre_Thrillers
0,2020,7.4,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020,7.9,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020,6.3,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020,4.4,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020,6.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3886,1974,6.8,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3887,1974,5.6,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3888,1974,8.8,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3889,1962,7.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Setting up the model

In [7]:
# Separate the Features(X) from the Target (y)
y = titles_df["Growth Outcome"]
X = titles_df.drop(columns = "Growth Outcome")

In [8]:
# Split our data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(2918, 104)

In [9]:
# Create a Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs', random_state=1, max_iter=200)
classifier

LogisticRegression(max_iter=200, random_state=1)

In [10]:
# Fit(train) our model using the training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

In [11]:
# Validate the model using the training data
y_train_pred = classifier.predict(X_train)
training_results = pd.DataFrame({"Prediction": y_train_pred, "Actual": y_train}).reset_index(drop=True)
training_results.head(20)
accuracy_score(y_train, y_train_pred)

0.8690884167237835

## Make predicitions

In [12]:
# Predict outcomes for data set
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,0,0
3,1,1
4,0,0
5,0,0
6,1,1
7,1,1
8,0,0
9,0,0


## Validate the model with test data

In [13]:
# Validate the model using the test data
accuracy_score(y_test, y_pred)

0.8448098663926003

In [14]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[399,  57],
       [ 94, 423]], dtype=int64)

In [15]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      0.88      0.82      0.84      0.85      0.72       456
          1       0.88      0.82      0.88      0.85      0.85      0.71       517

avg / total       0.85      0.84      0.85      0.84      0.85      0.72       973



## Using Model to predict Outcomes on new Dataset

In [16]:
# Import the data, combined CSV file and read it into a Pandas DataFrame
new_df = pd.read_csv('Trial_dataset.csv')
new_df

Unnamed: 0,Title,type,country,date_added,release_year,rating,duration,description,IMDb,Genre
0,Bridgerton,Tv Show,United Kingdom,25-Dec-20,2020,TV-MA,1 season,"Wealth, lust, and betrayal set against the bac...",7.3,Drama
1,Queer Eye,Tv Show,United States,7-Feb-18,2018,TV-MA,5 Seasons,A new Fab Five set out to Atlanta to help some...,8.5,Reality
2,Molly's Game,Movie,United States,16-Dec-19,2017,R,140 minutes,"The true story of Molly Bloom, an Olympic-clas...",7.4,Crime
3,Extraction,Movie,Bangladesh,24-Apr-20,2020,R,117 minutes,"a fearless black market mercenary, embarks on ...",6.7,Action & Adventure
4,Emily in Paris,Tv Show,France,2-Oct-20,2020,TV-MA,1 season,A young American woman from the Midwest is hir...,7.1,Comedy


In [None]:
# Drop description
new_df=new_df.drop(['description'],axis='columns')
new_df

In [None]:
# Create Features(X) 
X_new = new_df

In [None]:
# Predict outcomes for data set
y_predict_new = classifier.predict(X_new)
new_results = pd.DataFrame({"Prediction": y_predict_new}).reset_index(drop=True)
new_results.head(20)