In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the movie dataset (row 0 holds the column names) and preview it.
df = pd.read_csv(filepath_or_buffer="Movie_classification.csv", header=0)
df.head()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
df.info()

### Missing Value Imputation

In [None]:
# Mean of Time_taken; this is the value used to fill its missing entries.
df['Time_taken'].mean()

In [None]:
# Fill missing Time_taken values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['Time_taken'] selection: that chained
# in-place form is deprecated and leaves df unchanged under pandas
# Copy-on-Write.
df['Time_taken'] = df['Time_taken'].fillna(df['Time_taken'].mean())

In [None]:
# Re-check non-null counts after imputing Time_taken.
df.info()

### Dummy Variable Creation

In [None]:
# Preview the frame before dummy-encoding.
df.head()

In [None]:
# Replace the listed columns with dummy (indicator) columns, dropping the
# first level of each. Note: this rebinds `df`, so the cell cannot be re-run
# without reloading the data.
df = pd.get_dummies(
    data=df,
    columns=["3D_available", "Genre"],
    drop_first=True,
)

In [None]:
# Preview the frame after dummy-encoding.
df.head()

### X-y split

In [None]:
# Separate the target column (Start_Tech_Oscar) from the predictor columns.
y = df["Start_Tech_Oscar"]
X = df.drop(columns=["Start_Tech_Oscar"])

### Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Training Classification Tree

In [None]:
from sklearn import tree
# Unconstrained classification tree. random_state is fixed because the
# tree permutes features when searching for splits, so ties between equally
# good splits could otherwise be resolved differently on each run.
clftree = tree.DecisionTreeClassifier(random_state=0)

In [None]:
# Fit the decision tree on the training split.
clftree.fit(X_train, y_train)

### Predict values using trained model

In [None]:
# Predict class labels for the held-out test rows.
y_test_pred = clftree.predict(X_test)

In [None]:
# Display the predicted test labels.
y_test_pred

### Model Performance

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(y_test, y_test_pred)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_test_pred)

### Plotting decision tree

In [None]:
from sklearn import tree
# Draw the fitted tree on an explicit figure/axes pair.
# feature_names is passed as a plain list because some scikit-learn
# releases reject a pandas Index for this argument.
# The trailing semicolon suppresses the list-of-annotations repr.
fig, ax = plt.subplots(figsize=(25, 20))
tree.plot_tree(clftree, feature_names=list(X.columns), filled=True, ax=ax);

## Controlling Tree growth

In [None]:
# Growth-controlled tree: every leaf must contain at least 20 training
# samples and the tree is at most 4 levels deep. random_state makes the
# fitted tree repeatable across runs.
clftree2 = tree.DecisionTreeClassifier(min_samples_leaf=20, max_depth=4, random_state=0)
clftree2.fit(X_train, y_train)

# feature_names is passed as a plain list because some scikit-learn
# releases reject a pandas Index for this argument.
fig, ax = plt.subplots(figsize=(25, 20))
tree.plot_tree(clftree2, feature_names=list(X.columns), filled=True, ax=ax);

In [None]:
# Test accuracy of the growth-controlled tree.
accuracy_score(y_test, clftree2.predict(X_test))

### Random Forest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

- The `n_estimators` parameter decides the total number of trees in the forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyperparameters. random_state fixes the
# bootstrap samples and feature subsets so results are repeatable; the same
# seed is inherited by the clones made during the hyperparameter searches.
rf_clf = RandomForestClassifier(random_state=0)

In [None]:
# Fit the random forest on the training split.
rf_clf.fit(X_train, y_train)

In [None]:
# Confusion matrix of the random forest on the test split.
confusion_matrix(y_test, rf_clf.predict(X_test))

In [None]:
# Test accuracy of the random forest.
accuracy_score(y_test, rf_clf.predict(X_test))

# RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for each hyperparameter; the search samples
# combinations from these lists.
params = dict(
    max_leaf_nodes=[*range(2, 10)],
    min_samples_split=[2, 3, 4],
    criterion=['gini', 'entropy'],
    max_depth=[*range(1, 10)],
)


In [None]:
# Randomized search (not a grid search) over the decision-tree
# hyperparameters in `params`, scored with 5-fold cross-validation.
# Only n_iter=2 parameter settings are sampled; random_state fixes which
# ones are drawn so the outcome is reproducible.
clf = tree.DecisionTreeClassifier(random_state=17)
rs_cv = RandomizedSearchCV(clf, params, cv=5, n_iter=2, random_state=17)
rs_cv.fit(X_train, y_train)

In [None]:
# Estimator with the best cross-validation score among the sampled settings.
rs_cv.best_estimator_

### Grid Search

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid; every combination of these values is evaluated.
params = dict(
    max_leaf_nodes=list(range(3, 8)),
    min_samples_split=[2, 3, 4],
    criterion=['gini', 'entropy'],
    max_depth=list(range(3, 7)),
)

In [None]:
# Exhaustive grid search over every combination in `params`, scored with
# 3-fold cross-validation. The base tree is seeded so that the scores, and
# therefore the selected parameters, are repeatable across runs.
clf = tree.DecisionTreeClassifier(random_state=17)
grid_search_cv = GridSearchCV(clf, params, cv=3)
grid_search_cv.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_search_cv.best_params_ 

In [None]:
# Keep the best estimator found by the grid search.
# NOTE: despite the "rf" in the name, the searched estimator is a single
# DecisionTreeClassifier, not a random forest.
cvrf_clf = grid_search_cv.best_estimator_
cvrf_clf

In [None]:
# Test accuracy of the grid-search-selected tree.
accuracy_score(y_test, cvrf_clf.predict(X_test))

In [None]:
# Confusion matrix of the grid-search-selected tree on the test split.
confusion_matrix(y_test, cvrf_clf.predict(X_test))

# Applying Decision Tree with the best parameters

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Build a fresh, unfitted tree with the hyperparameters selected by the grid
# search. They are taken from the selected estimator (cvrf_clf) instead of
# being hard-coded, so this cell cannot drift out of sync with whatever the
# search actually chose on this run.
dtc_best = DecisionTreeClassifier(**cvrf_clf.get_params())

In [None]:
# Fit the tuned decision tree on the training split.
dtc_best.fit(X_train, y_train)

In [None]:
# Predicted test labels from the tuned decision tree.
dtc_best.predict(X_test)

## Best parameters for Random Forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for each random-forest hyperparameter; n_estimators
# spans every integer from 10 to 9999.
params = dict(
    n_estimators=[*range(10, 10000)],
    max_leaf_nodes=[*range(2, 10)],
    min_samples_split=[2, 3, 4],
    criterion=['gini', 'entropy'],
    max_depth=[*range(1, 10)],
)


In [None]:
# Randomized search (not a grid search) over the random-forest
# hyperparameters in `params`: 2 sampled settings, 5-fold cross-validation.
# random_state fixes which settings are drawn. n_jobs=-1 runs the fits on
# all available cores, which matters because a sampled forest may contain
# thousands of trees; it does not change the result.
rs_cv = RandomizedSearchCV(rf_clf, params, cv=5, n_iter=2, random_state=17, n_jobs=-1)
rs_cv.fit(X_train, y_train)

In [None]:
# Random forest with the best cross-validation score among the sampled settings.
rs_cv.best_estimator_

In [None]:
# Random-forest hyperparameter grid; every combination is evaluated.
params = dict(
    n_estimators=[4600, 4700, 4800, 5000],
    max_leaf_nodes=list(range(3, 8)),
    min_samples_split=[2, 3, 4],
    criterion=['gini', 'entropy'],
    max_depth=list(range(3, 7)),
)

In [None]:
# Exhaustive grid search over the random-forest grid in `params`, scored
# with 3-fold cross-validation. Every combination is fitted once per fold
# and each forest has thousands of trees, so n_jobs=-1 spreads the fits
# across all available cores; the selected parameters are unaffected.
grid_search_cv = GridSearchCV(rf_clf, params, cv=3, n_jobs=-1)
grid_search_cv.fit(X_train, y_train)

In [None]:
# Best random-forest parameter combination found by the grid search.
# The attribute is `best_params_`; `best_parameters_` does not exist on
# GridSearchCV and raises AttributeError.
grid_search_cv.best_params_