In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import (classification_report,recall_score,precision_score,accuracy_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [None]:
from pathlib import Path

# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = Path('netflix_titles.csv')

# Only prompt for an upload when the file is not already present, so that
# "Restart & Run All" does not block on an interactive dialog every time and
# the notebook can also run outside Colab once the CSV sits next to it.
# NOTE: a stale copy of the file is reused as-is; delete it to force a re-upload.
uploaded = {}
if not DATA_PATH.exists():
    from google.colab import files
    uploaded = files.upload()

data = pd.read_csv(DATA_PATH)
display(data.head())

In [None]:
# Remove columns that are not used as features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError because
# the columns are already gone.
data = data.drop(columns=['show_id', 'date_added', 'cast', 'director'], errors='ignore')

In [None]:
# Rows that are exact copies of an earlier row (first occurrence is not flagged).
duplicate = data[data.duplicated()]
# Report the row count itself; the original printed the (rows, cols) shape
# tuple under a label that promised a number of rows.
print('Number of duplicate rows: ', len(duplicate))
# Kept as the last expression so the notebook actually renders it.
data.shape

In [None]:
# Missing values per column, before any imputation.
data.isna().sum()

In [None]:
# Fill missing 'country' entries with the most frequent value in that column.
most_common_country = data['country'].mode()[0]
data['country'] = data['country'].fillna(most_common_country)
# Should now be 0.
data['country'].isna().sum()

In [None]:
# Missing values per column after filling 'country'.
data.isna().sum()

In [None]:
# Show the dtype pandas assigned to each remaining column.
data.dtypes

In [None]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

In [None]:
# Target is the first column; every other column is a candidate feature.
y = data.iloc[:, 0]
X_non_dummy = data.iloc[:, 1:]

# One-hot encode every feature column.
# NOTE(review): this encodes all remaining columns, whatever their dtype or
# cardinality - confirm that is intended for each of them.
X = pd.get_dummies(X_non_dummy, columns=X_non_dummy.columns)

# 70/30 split, stratified on the target so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Baseline: always predict the most frequent class seen in training.
dummy_classifier = DummyClassifier(strategy='most_frequent')
dummy_classifier.fit(X_train, y_train)
baseline_acc = dummy_classifier.score(X_test, y_test)
print("Baseline Accuracy = ", baseline_acc)

In [None]:
# Bagging ensemble of 10 default base estimators, seeded for reproducibility.
model_bagging = BaggingClassifier(n_estimators=10, random_state=42)
model_bagging = model_bagging.fit(X_train, y_train)  # fit() returns the estimator itself

# Score on the held-out split.
pred_bagging = model_bagging.predict(X_test)
acc_bagging = accuracy_score(y_true=y_test, y_pred=pred_bagging)
print('Accuracy = ', acc_bagging)

In [None]:
# Random forest: 100 trees, each split considering at most 8 features.
model_rf = RandomForestClassifier(n_estimators=100, max_features=8, random_state=42)
model_rf.fit(X_train, y_train)
predict_rf = model_rf.predict(X_test)

# Micro-averaged recall/precision are kept for any later use. For single-label
# classification both reduce to plain accuracy, so they add no extra information.
recall_rf = recall_score(y_test, predict_rf, average='micro')
precision_rf = precision_score(y_test, predict_rf, average='micro')

# Report accuracy with the metric that is actually named in the output,
# instead of printing the precision value under an "Accuracy" label.
acc_rf = accuracy_score(y_test, predict_rf)
print('Accuracy =', acc_rf)

In [None]:
# Rank the one-hot features by the random forest's importance scores.
features = X_train.columns
feature_importances = model_rf.feature_importances_
dframe = (
    pd.DataFrame({'features': features, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)
# Three most important features.
print(dframe.head(3))

In [None]:
# AdaBoost over shallow (depth-4) decision trees.
base_est = DecisionTreeClassifier(max_depth=4)
ada_boost = AdaBoostClassifier(base_est, n_estimators=100, random_state=42, learning_rate=.05)
ada_boost.fit(X_train, y_train)

# The original printed micro-averaged recall under an "Accuracy" label; use
# accuracy_score so the metric matches what is reported.
res1 = round(accuracy_score(y_test, ada_boost.predict(X_test)), 4)
print('Accuracy =', res1)

In [None]:
# Importance scores from the fitted AdaBoost model, one per encoded feature.
feature_importances = ada_boost.feature_importances_
print(feature_importances)

# Pair each score with its column name and rank from most to least important.
features = X_train.columns
dframe = (
    pd.DataFrame({'features': features, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)
# Three most important features.
print(dframe.head(3))

In [None]:
# Members of the soft-voting ensemble. Every estimator is seeded so the cell
# gives the same result on each run (the decision tree was previously unseeded).
rfClf = RandomForestClassifier(n_estimators=10, random_state=0)
svmClf = SVC(probability=True, random_state=0)  # probability=True is required for soft voting
logClf = LogisticRegression(random_state=0)
dtClf = DecisionTreeClassifier(max_depth=4, random_state=0)

# Soft voting combines the members' predicted class probabilities.
clf2 = VotingClassifier(
    estimators=[('rf', rfClf), ('svm', svmClf), ('log', logClf), ('dt', dtClf)],
    voting='soft',
)
clf2.fit(X_train, y_train)
clf2_pred = clf2.predict(X_test)

# Micro-averaged recall/precision are computed but not printed here.
recall_voting = recall_score(y_test, clf2_pred, average='micro')
precision_voting = precision_score(y_test, clf2_pred, average='micro')
print('Accuracy score =', accuracy_score(y_test, clf2_pred))

In [None]:
import xgboost as xgb

# DMatrix labels must be numeric. If the target holds non-numeric class names,
# map each distinct value to an integer code; a numeric target is passed through.
if pd.api.types.is_numeric_dtype(y):
    xgb_labels = y
else:
    xgb_labels, _ = pd.factorize(y)

# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
# NOTE(review): the rest of the notebook treats y as a class label; a
# classification objective/metric may be more appropriate here - confirm intent.
parameters = {"objective": "reg:squarederror", 'colsample_bytree': 0.3, 'learning_rate': 0.1,
              'max_depth': 5, 'alpha': 10}
data_dmatrix = xgb.DMatrix(data=X, label=xgb_labels)

# 3-fold CV, up to 50 rounds, stopping after 10 rounds without RMSE improvement.
# The booster parameters are passed as `params`; xgb.cv has no `parameters` keyword.
cv_results = xgb.cv(params=parameters, dtrain=data_dmatrix, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10, metrics="rmse",
                    as_pandas=True, seed=123)

In [None]:
# Train a 10-round booster on the full DMatrix.
# The booster parameters are passed as `params`; xgb.train has no `parameters` keyword.
xg_reg = xgb.train(params=parameters, dtrain=data_dmatrix, num_boost_round=10)

# plot_importance returns the matplotlib Axes it drew on. Resize that figure
# directly: the original set rcParams after plotting (too late for this figure)
# through a `plt` name that was never imported.
ax = xgb.plot_importance(xg_reg)
ax.figure.set_size_inches(5, 5)
# The notebook's inline backend renders the open figure when the cell finishes.