In [1]:
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

In [2]:
#Reads the Netflix titles CSV from the working directory and previews the first five rows
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'netflix_titles.csv'

In [None]:
#Prints a summary of the DataFrame: each column's name, non-null count and dtype
df.info()

In [None]:
#Displays the DataFrame dimensions as a (rows, columns) tuple
df.shape

In [None]:
#Counts the missing (NaN) entries in every column of the data-set
df.isna().sum()

In [None]:
#Replaces NaN in the text columns with an explicit "Unknown ..." placeholder

#Maps each column to the placeholder used for its missing entries
fill_values = {
    'director': 'Unknown Director',
    'country': 'Unknown Country',
    'rating': 'Unknown Rating',
    'cast': 'Unknown Cast',
}

#Applies the placeholder column by column
for col_name, fill_value in fill_values.items():
    df[col_name] = df[col_name].fillna(fill_value)

In [None]:
#date_added is converted from object (text) to date-time.
#Surrounding whitespace is stripped first: pandas infers a single date format from
#the first value and raises on entries that do not match it, so a stray leading
#space in any row would otherwise abort the conversion.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip())
#Keeps only the four-digit year of each date, as text (missing dates stay NaN).
#This step will allow for an easier data-cleansing process
df['date_added'] = df['date_added'].dt.strftime('%Y')

In [None]:
#Discards every row whose date_added is missing.
#Only a small number of rows are affected and there is no sensible value to impute.
df = df.dropna(subset=['date_added'])

In [None]:
#Re-checks the (rows, columns) dimensions to confirm the rows were removed
df.shape

In [None]:
#Casts the date_added year from object (text) to int
df = df.astype({'date_added': int})

In [None]:
#Missing durations receive the text placeholder "0" so that the whole column
#can be turned into ints in a later step
df = df.fillna({'duration': "0"})

In [None]:
#The duration column mixes two units: minutes ("xx min") and seasons ("x Season" / "x Seasons").
#Only the number is kept, so TV shows and movies must be analyzed separately to use the data accurately:
#for TV shows the value is the number of seasons, for movies it is the number of minutes.
#A single digit extraction replaces deleting "min", "Season" and "s" one after another,
#which removed every "s" in the text and left trailing whitespace behind.
df['duration'] = df['duration'].str.extract(r'(\d+)', expand=False)

In [None]:
#Casts the duration strings to int
df = df.astype({'duration': int})

In [None]:
#show_id is not needed for the analysis, so the column is dropped
df = df.drop(columns=['show_id'])

In [None]:
#Some rows hold a runtime such as "74 min" in the rating column instead of a rating.
#Any rating containing "min" is replaced wholesale with the unknown-rating placeholder,
#rather than relying on a hard-coded list of the specific minute values.
misplaced_duration = df['rating'].str.contains('min', regex=False, na=False)
df['rating'] = df['rating'].mask(misplaced_duration, 'Unknown Rating')

#removes leading and trailing white space
df['rating'] = df['rating'].str.strip()

In [None]:
#Label-encodes the categorical columns into integer *_enc columns.
#Each fitted encoder is kept in its own variable.

#Type
type_enc = LabelEncoder()
df['type_enc'] = type_enc.fit_transform(df['type'])

#Rating
rating_enc = LabelEncoder()
df['rating_enc'] = rating_enc.fit_transform(df['rating'])

#Country
country_enc = LabelEncoder()
df['country_enc'] = country_enc.fit_transform(df['country'])

#The original text columns are removed now that their encoded versions exist
df.drop(columns=['type', 'rating', 'country'], inplace=True)

In [None]:
# Separate the model inputs from the target before the train/test split

#'X' holds the observations (feature columns), 'y' holds the label to predict
y = df.loc[:, 'type_enc'].copy()
X = df.loc[:, ['country_enc', 'release_year', 'rating_enc', 'duration']].copy()

In [None]:
#Partitions the data into training and test sets.
#test_size=0.2 is the proportion of the dataset placed in the test split;
#random_state=42 controls the shuffling applied to the data before the split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Creates an array of integer k-values from 1 to 30 inclusive
#(np.arange excludes its stop value, so the stop must be 31)
neighbors = np.arange(1, 31)

#Initializes two empty arrays with one slot per k-value
#Holds the accuracy of each model on the training data
accuracy_train = np.empty(len(neighbors))
#Holds the accuracy of each model on the test data
accuracy_test = np.empty(len(neighbors))

#Iterates through each k-value in [1,30] and stores each accuracy value in its respective array
for i, k in enumerate(neighbors):
    #Fits a K-NN classifier that uses k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    #Accuracy on the training data for the k-value at position i
    accuracy_train[i] = knn.score(X_train, y_train)
    #Accuracy on the test data for the k-value at position i
    accuracy_test[i] = knn.score(X_test, y_test)

#Visualizes the test and train accuracy as a function of k
fig, ax = plt.subplots()
ax.plot(neighbors, accuracy_test, label = 'Testing Model Accuracy')
ax.plot(neighbors, accuracy_train, label = 'Training Model Accuracy')
ax.set_title('The Number of K-Neighbors and the Accuracy of Training/Testing Accuracy')
ax.legend()
ax.set_xlabel('K-Value')
ax.set_ylabel('Accuracy')
plt.show()

In [None]:
#Builds and fits the K-NN model with the chosen K value of 10
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

#Scores the fitted model on both splits
test_accuracy = knn.score(X_test, y_test)
train_accuracy = knn.score(X_train, y_train)

#Prints the Test and Train accuracy as whole percentages
print("test accuracy is:", f"{test_accuracy:.0%}")
print("train accuracy is:", f"{train_accuracy:.0%}")

#Predictions of the model on the held-out test observations
y_pred = knn.predict(X_test)

#Per-class precision / recall / f1 report for k = 10
print(classification_report(y_test, y_pred))