In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics as skMetrics
from sklearn.metrics import confusion_matrix


# Read the raw data set (the path is relative to the notebook's working directory).
data = pd.read_csv("imdb_top_1000.csv")

# Fix column data types.
# Released_Year: the value "PG" is swapped for 1990 so the column can be cast to int.
# NOTE(review): 1990 is a hard-coded stand-in - confirm the real release year of the affected row(s).
data["Released_Year"] = data["Released_Year"].replace("PG", 1990).astype(int)
# Runtime: remove the literal " min" suffix. str.strip(" min") would instead strip any of the
# characters ' ', 'm', 'i', 'n' from both ends, which only works by accident.
data["Runtime"] = data["Runtime"].str.replace(" min", "", regex=False).astype(int)
# Gross: drop the comma separators, then cast to float.
data["Gross"] = data["Gross"].str.replace(",", "", regex=False).astype(float)
data.dtypes

FileNotFoundError: ignored

In [None]:
# Show all data: the bare `data` expression on the cell's last line is rendered as the cell output.
data

In [None]:
# First 10 rows of the data set.
data.head(10)


In [None]:
# Last 10 rows of the data set.
data.tail(10)

In [None]:
# Print the dimensions of the data set: number of rows, then number of columns.
row_count, column_count = data.shape
print(f"Rows : {row_count} Columns : {column_count}")

In [None]:
#Print the name of all the columns.
# Take the names straight from the frame so the list always matches the real number of
# columns instead of assuming there are exactly 16.
columns = data.columns.tolist()
print(columns)

In [None]:
# Name of the last column, read from the end of the `columns` list.
last_column_name = columns[-1]
print(f"Last Column name : {last_column_name}")

In [None]:
#What is the type of 4th column?
# data.dtypes is indexed by column name, so pick the 4th entry by position with .iloc
# rather than with a bare integer key.
print("4th coloumn type:"+str(data.dtypes.iloc[3]))

In [None]:
#Choose a categorical column and find how many different types in this column?
# nunique() leaves missing values out of the count; len(unique()) would count NaN as one more type.
print("Number of different types in column 'Certificate' :"+str(data["Certificate"].nunique()))

In [None]:
# Most frequent value of the 'Certificate' column: tally each value, then take the label
# with the highest tally.
certificate_counts = data["Certificate"].value_counts()
print(f"Most frequent type in column 'Certificate': {certificate_counts.idxmax()}")

In [None]:
# Summary statistics (mean, standard deviation, 25% / 50% / 75% percentiles, ...) as
# produced by DataFrame.describe().
numeric_summary = data.describe()
print(numeric_summary)

In [None]:
#Data Preparation

# Row filter: keep titles rated at least 8.5 that also received at least 1,000,000 votes.
highly_rated = data["IMDB_Rating"] >= 8.5
widely_voted = data["No_of_Votes"] >= 1000000
data[highly_rated & widely_voted]

In [None]:
# Titles whose name starts with 'Y': build the mask once, print how many rows match,
# then show the matching rows as the cell output.
starts_with_y = data["Series_Title"].str.startswith('Y')
titles_with_y = data[starts_with_y]
print(f"Count : {titles_with_y.shape[0]}")
titles_with_y

In [None]:
# General statistics for the categorical 'Certificate' attribute, as reported by Series.describe().
certificate_summary = data["Certificate"].describe()
print(certificate_summary)

In [None]:
#Change type of a numerical column to object.
# Read the dtype from the column itself instead of from position 8 of data.dtypes, so the
# report stays correct if the column order changes and no positional integer key is needed.
print("Meta_score old type:"+str(data['Meta_score'].dtype))
data['Meta_score']=data['Meta_score'].astype('object')
print("Meta_score new type:"+str(data['Meta_score'].dtype))
# Restore the numeric type so later cells can keep computing with the column.
data['Meta_score']=data['Meta_score'].astype(float)

In [None]:
# Group the whole data set by two categorical attributes and show the first row of each group.
grouping_keys = ['Certificate', 'Director']
goruped_data = data.groupby(by=grouping_keys)
goruped_data.first()

In [None]:
# Missing values: number of NaN entries in each column.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
#Find the index of rows with any row with 2 missing values
# arr holds the number of missing values in each row.
arr = data.isnull().sum(axis=1)
# Select the index labels of the rows with exactly 2 missing values and print one per line.
for row_index in arr.index[arr == 2]:
    print(row_index)

In [None]:
#If there are any NaN values in any column, substitute these missing values with the mean/mode of
#this column
# Assign the filled column back to the frame. Calling fillna(..., inplace=True) on a column
# pulled out of the frame is chained in-place mutation, which pandas warns about and which
# leaves the frame unchanged once Copy-on-Write is active.
# Categorical column: fill with the most frequent value.
data["Certificate"] = data["Certificate"].fillna(data["Certificate"].mode()[0])
# Numerical columns: fill with the column mean.
data["Meta_score"] = data["Meta_score"].fillna(data["Meta_score"].mean())
data["Gross"] = data["Gross"].fillna(data["Gross"].mean())
# Re-check the per-column count of missing values.
print(data.isna().sum())

In [None]:
# Find the total number of duplicated rows and drop them if there are any.
print("Sum of Duplicates : "+str(data.duplicated().sum()))
# Drop the duplicated rows (a no-op when the count above is 0) and renumber the index.
data = data.drop_duplicates().reset_index(drop=True)

In [None]:
# Discretize any numerical column into 6 groups using equal interval method and count the values in
#each bin/Group. Make sure intervals are sorted when counting them).
# Split IMDB_Rating into 6 equal-width intervals, labelled from lowest to highest.
rating_groups = pd.cut(data.IMDB_Rating, 6,precision=0,labels=["Very Low Score","Low Score","Average Score","High Score","Very High Score","Extremly High Score"])
# Count the rows in each group; sort_index() orders the result by group rather than by count.
rating_groups.value_counts().sort_index()


In [None]:
# Row(s) holding the maximum of a chosen numerical variable, here No_of_Votes.
top_votes = data["No_of_Votes"].max()
data[data["No_of_Votes"] == top_votes]

In [None]:
#Make a suitable boxplot with any chosen attribute from the data set (Think about why chosing
#this attribute before plotting).
# Draw on an explicit Axes and label it so the figure can be read on its own.
fig, ax = plt.subplots()
ax.boxplot(data.Released_Year)
ax.set_title("Boxplot of Released_Year")
ax.set_ylabel("Released_Year")
# plt.show() renders the figure without echoing the return value of boxplot().
plt.show()

In [None]:
#Make a suitable histogram with any chosen attribute from the data set (Think about why chosing
#this attribute before plotting).
# Draw on an explicit Axes and label it so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(data.Runtime)
ax.set_title("Histogram of Runtime")
ax.set_xlabel("Runtime (min)")
ax.set_ylabel("Number of titles")
# plt.show() renders the figure without echoing the return value of hist().
plt.show()

In [None]:
#Make a suitable scatterplot with any chosen attributes from the data set (Think about why chosing
#this attribute before plotting).
# Draw on an explicit Axes and label it so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(data.Released_Year, data.No_of_Votes)
ax.set_title("No_of_Votes against Released_Year")
ax.set_xlabel("Released_Year")
ax.set_ylabel("No_of_Votes")
# plt.show() renders the figure without echoing the return value of scatter().
plt.show()

In [None]:
#Testing and Training

# Discretize the Gross column into 3 equal-width classes. The column is overwritten in
# place, so only bin it while it is not yet categorical: a second run of this cell would
# otherwise hand the 'Low'/'Mid'/'High' labels to pd.cut instead of numbers.
if not isinstance(data["Gross"].dtype, pd.CategoricalDtype):
    data["Gross"] = pd.cut(data["Gross"], 3, precision=0, labels=['Low','Mid','High'])

# Split the data into features (X) and labels (y).
X = data[['IMDB_Rating', 'Released_Year', 'Runtime', 'Meta_score']]
y = data['Gross']

# Shuffled 80% / 20% train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=104, test_size=0.20, shuffle=True)

In [None]:
# K-nearest-neighbours model

# Build a 3-neighbour classifier and train it on the training split in one expression
# (fit() returns the fitted estimator).
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predicted classes for the held-out test rows
Ky_pred = knn_model.predict(X_test)

In [None]:
#K nearest neighbor Model Performance metrics
# Every score is scaled to a percentage; precision, recall and F1 use the macro average.
Kaccuracy = knn_model.score(X_test, y_test)*100
print("Accuracy = ", Kaccuracy)
# NOTE(review): zero_division=1 scores a class that is never predicted as precision 1.0,
# which raises the macro average - confirm this is intended.
Kprecision = skMetrics.precision_score(y_test, Ky_pred,zero_division=1, average='macro')*100
print("Precision = ",Kprecision)
Krecall = skMetrics.recall_score(y_test, Ky_pred, average='macro')*100
# Report the recall itself, not the precision.
print("Recall = ",Krecall)
Kf_score = skMetrics.f1_score(y_test, Ky_pred, average='macro')*100
print("f_score = ",Kf_score)
KcMatrix = confusion_matrix(y_test, Ky_pred)
print("Confusion Matrix : ")
print(KcMatrix)


In [None]:
# Gaussian Naive Bayes model

# Build the classifier and train it on the training split in one expression
# (fit() returns the fitted estimator).
naiveBayes_model = GaussianNB().fit(X_train, y_train)

# Predicted classes for the held-out test rows
Ny_pred = naiveBayes_model.predict(X_test)

In [None]:
#Naive Bayes Performance metrics
# Every score is scaled to a percentage; precision, recall and F1 use the macro average.
Naccuracy = naiveBayes_model.score(X_test, y_test)*100
print("Accuracy = ", Naccuracy)
# NOTE(review): zero_division=1 scores a class that is never predicted as precision 1.0,
# which raises the macro average - confirm this is intended.
Nprecision = skMetrics.precision_score(y_test, Ny_pred,zero_division=1, average='macro')*100
print("Precision = ",Nprecision)
Nrecall = skMetrics.recall_score(y_test, Ny_pred, average='macro')*100
# Report the recall itself, not the precision.
print("Recall = ",Nrecall)
Nf_score = skMetrics.f1_score(y_test, Ny_pred, average='macro')*100
print("f_score = ",Nf_score)
NcMatrix = confusion_matrix(y_test, Ny_pred)
print("Confusion Matrix : ")
print(NcMatrix)


In [None]:
#Decision Tree Model
# Initialize the model. A fixed random_state makes the fitted tree, and therefore the
# metrics computed from it, repeatable across runs; 104 is the seed the train/test split uses.
decisionTree_model = DecisionTreeClassifier(random_state=104)

# Fit the model to the training data
decisionTree_model.fit(X_train, y_train)

# Predict on the test data
Dy_pred = decisionTree_model.predict(X_test)

In [None]:
#Decision Tree Performance metrics
# Every score is scaled to a percentage; precision, recall and F1 use the macro average.
Daccuracy = decisionTree_model.score(X_test, y_test)*100
print("Accuracy = ", Daccuracy)
# NOTE(review): zero_division=1 scores a class that is never predicted as precision 1.0,
# which raises the macro average - confirm this is intended.
Dprecision = skMetrics.precision_score(y_test, Dy_pred,zero_division=1, average='macro')*100
print("Precision = ",Dprecision)
Drecall = skMetrics.recall_score(y_test, Dy_pred, average='macro')*100
# Report the recall itself, not the precision.
print("Recall = ",Drecall)
Df_score = skMetrics.f1_score(y_test, Dy_pred, average='macro')*100
print("f_score = ",Df_score)
DcMatrix = confusion_matrix(y_test, Dy_pred)
print("Confusion Matrix : ")
print(DcMatrix)


In [None]:
#Comparison
def largest(num1, num2, num3):
    """Return the largest of three scores together with the algorithm it belongs to.

    The arguments are the KNN, Naive Bayes and Decision Tree scores, in that order.
    The result is a tuple ``(str(score), " From <name> algorithm")``. On a tie the
    earlier algorithm in that order is reported.
    """
    # Each entry: the candidate score, the two scores it has to match or beat, its label.
    contenders = (
        (num1, (num2, num3), " From KNN algorithm"),
        (num2, (num1, num3), " From Naive Bayes algorithm"),
    )
    for score, rivals, origin in contenders:
        if all(score >= rival for rival in rivals):
            return (str(score), origin)
    # Neither of the first two is the maximum, so the third one is.
    return (str(num3), " From Decision Tree algorithm")
# Report the best value of every metric; scores are passed as (KNN, Naive Bayes, Decision Tree).
for metric_name, metric_scores in (
    ("Accuracy", (Kaccuracy, Naccuracy, Daccuracy)),
    ("Precision", (Kprecision, Nprecision, Dprecision)),
    ("Recall", (Krecall, Nrecall, Drecall)),
    ("f_score", (Kf_score, Nf_score, Df_score)),
):
    print(f"Best {metric_name} is : ", largest(*metric_scores))
print()

def middle(num1, num2, num3):
    """Return the middle (median) of three scores together with the algorithm it belongs to.

    The arguments are the KNN, Naive Bayes and Decision Tree scores, in that order.
    The result is a tuple ``(str(score), " From <name> algorithm")``.
    """
    def _lies_between(value, bound_a, bound_b):
        # True when value sits between the two bounds, whichever of them is larger.
        return bound_a <= value <= bound_b or bound_b <= value <= bound_a

    if _lies_between(num1, num2, num3):
        return (str(num1), " From KNN algorithm")
    if _lies_between(num2, num1, num3):
        return (str(num2), " From Naive Bayes algorithm")
    # Neither of the first two is in the middle, so the third one is.
    return (str(num3), " From Decision Tree algorithm")

# Report the middle value of every metric; scores are passed as (KNN, Naive Bayes, Decision Tree).
for metric_name, metric_scores in (
    ("Accuracy", (Kaccuracy, Naccuracy, Daccuracy)),
    ("Precision", (Kprecision, Nprecision, Dprecision)),
    ("Recall", (Krecall, Nrecall, Drecall)),
    ("f_score", (Kf_score, Nf_score, Df_score)),
):
    print(f"Middle {metric_name} is : ", middle(*metric_scores))
print()

def smallest(num1, num2, num3):
    """Return the smallest of three scores together with the algorithm it belongs to.

    The arguments are the KNN, Naive Bayes and Decision Tree scores, in that order.
    The result is a tuple ``(str(score), " From <name> algorithm")``. On a tie the
    earlier algorithm in that order is reported.
    """
    # Use <= rather than <: with strict comparisons two equal minimum scores both fail
    # their tests and the (larger) Decision Tree score is reported as the smallest.
    if num1 <= num2 and num1 <= num3:
        return (str(num1)," From KNN algorithm")
    elif num2 <= num1 and num2 <= num3:
        return (str(num2)," From Naive Bayes algorithm")
    else:
        return (str(num3)," From Decision Tree algorithm")
# Report the worst value of every metric; scores are passed as (KNN, Naive Bayes, Decision Tree).
for metric_name, metric_scores in (
    ("Accuracy", (Kaccuracy, Naccuracy, Daccuracy)),
    ("Precision", (Kprecision, Nprecision, Dprecision)),
    ("Recall", (Krecall, Nrecall, Drecall)),
    ("f_score", (Kf_score, Nf_score, Df_score)),
):
    print(f"Worst {metric_name} is : ", smallest(*metric_scores))

