In [1]:
import pandas as pd
import numpy as np
import math
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.datasets import fetch_openml
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

## Q1: Collaborative Filtering

 ### Extracting data into dataframes

In [2]:
testRatings = pd.read_csv("netflix\\TestingRatings.txt", header = None)
trainRatings = pd.read_csv("netflix\\TrainingRatings.txt", header = None)

movieTitles = open("netflix\\movie_titles.txt", 'r')
movieNames = movieTitles.read()
movieList = movieNames.splitlines()
movieTitles = []
for i in range(int(len(movieList))):
    movieTitles.append([int(movieList[i].split(",")[0]),(movieList[i].split(",")[1]),",".join(movieList[i].split(",")[2:])])


movieTitles = pd.DataFrame(movieTitles, columns = ["MovieID","Year","Title"])
movieTitles = movieTitles.set_index("MovieID")



In [3]:
testRatings = testRatings.rename(columns = {0:"MovieID",1:"UserId",2:"Ratings"})
trainRatings = trainRatings.rename(columns = {0:"MovieID",1:"UserId",2:"Ratings"})

### Splitting the training dataset into Train(70%) and Validation(30%)

In [4]:
trainRatings,validateRatings = train_test_split(trainRatings, test_size=0.3)

In [5]:
userSumCount = trainRatings.groupby(["UserId"]).agg(['sum','count'])["Ratings"]
userSumCount.reset_index() 

Unnamed: 0,UserId,sum,count
0,7,241.0,64
1,79,232.0,64
2,199,184.0,46
3,481,191.0,44
4,769,231.0,74
...,...,...,...
28973,2648869,272.0,74
28974,2648885,343.0,87
28975,2649120,195.0,51
28976,2649267,219.0,55


### Calculating the Vi bar variable for all the user in our train dataset who have voted for more than 0.5% of the total movies in the movie titles dataset.


In [6]:
trainData = userSumCount[userSumCount['count'] >= (0.006 * len(movieTitles))]  #Trim the dataset to select users who have voted for more than 0.6% of the total movie dataset provided.

trainData = userSumCount
vi_bar = (trainData["sum"] / trainData["count"]).to_frame().reset_index()
vi_bar = vi_bar.rename(columns={0:"meanVote"})


trainData = pd.merge(trainData, trainRatings, how ='inner', on =['UserId', 'UserId'])

# trainData
# vi_bar

### Creating a User vs Movie rating dataframe which will be used to perform further operations. This dataframe consists of UserIds in its columns and Movie Id as the index.
### With a combination of User Id and Movie Id, we can extract the ratings of the user for that particular user

In [7]:
userMovieRatings = pd.merge(trainData, movieTitles, how ='inner', on =['MovieID', 'MovieID'])

userMovieRatings = userMovieRatings.pivot_table(index = ["MovieID"],columns = ["UserId"],values = "Ratings").fillna(0)

userMovieRatings

UserId,7,79,199,481,769,906,1310,1333,1427,1442,...,2648572,2648589,2648730,2648734,2648853,2648869,2648885,2649120,2649267,2649285
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,4.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,4.0,...,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0
17728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### We will be using the KNN classifier from SKLearn to find the nearest neighbours of our user. Prediction of the user vote will be done wrt the user's neighbours

In [8]:
def get_Neighbours(userId, movieId, df = userMovieRatings):
    userVotedMovieList = userMovieRatings[userMovieRatings[userId] != 0]
    userMovieMatrix = userVotedMovieList.T.to_numpy()
    model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'auto')
    model_knn.fit(userMovieMatrix)
    distances, indices = model_knn.kneighbors(userVotedMovieList.T.loc[userId,:].values.reshape(1,-1), n_neighbors = 100)
    ids = vi_bar['UserId'].iloc[indices.tolist()[0]].to_numpy()
    ids = ids[1:,]
    return indices, ids



In [9]:
def CollaborativeFiltering(dataset, count = 0):

    trueValues = []
    predictedValues = []
    RoundpredictedValues = []

    if count != 0:
        dataset = dataset.head(count)

    for i in range(dataset.shape[0]):
        print(i)

        summation = 0
        wai = []

        userId = int(dataset.iloc[i].UserId)
        movieId = int(dataset.iloc[i].MovieID)
        
        indices, ids = get_Neighbours(userId,movieId)

        activeUserMovieList = userMovieRatings[userId]
        activeUserMeanVote = activeUserMovieList -  vi_bar[vi_bar.UserId == userId]["meanVote"].values[0]

        for k in indices[0]:

            neighbourUser = vi_bar['UserId'].iloc[k]
            neighborMovieList =  userMovieRatings[neighbourUser]
            neighborMeanVote = userMovieRatings[neighbourUser] -  vi_bar[vi_bar.UserId == neighbourUser]["meanVote"].values[0]
            data = (pd.concat([activeUserMovieList, neighborMovieList, activeUserMeanVote, neighborMeanVote], axis=1)).to_numpy()
            
            # Deleting the rows where only either the active user or the neighbour has voted.
            # In the result matrix, we will get the votes and mean votes of the common movies the active user and his neighbour has voted on
            data = data[~np.any(data[:,:2] == 0, axis=1)]             
            
            
            #Calculating the weigh(Wai) of the active user and its neighbour
            numeratorSum = np.sum(np.multiply(data[:,2],data[:,3]))
            denominatorValue1 = np.sum(np.multiply(data[:,2],data[:,2]))
            denominatorValue2 = np.sum(np.multiply(data[:,3],data[:,3]))
            denominator = math.sqrt(denominatorValue1 * denominatorValue2)
            
            if denominator == 0:
                continue
            
            tempWeightOfUserIandA = numeratorSum / denominator
            summation += tempWeightOfUserIandA * (neighborMeanVote[movieId])
            wai.append(abs(tempWeightOfUserIandA))

        # print(summation)
        waiArray = np.array(wai)

        if waiArray.sum() == 0:
            summation = 0
            predictedVote = (vi_bar[vi_bar["UserId"] == userId].meanVote.values[0]) 
        else:
            predictedVote = (vi_bar[vi_bar["UserId"] == userId].meanVote.values[0]) + (abs(1/waiArray.sum()))*summation

        RoundPredictedVote = round(predictedVote)
        PredictedVote = predictedVote


        # print(predictedVote , " Actual : ",dataset.iloc[i].Ratings)

        trueValues.append(dataset.iloc[i].Ratings)
        predictedValues.append(predictedVote)
        RoundpredictedValues.append(RoundPredictedVote)
        
    # print(trueValues)
    # print(predictedValues)
    rmse =  mean_squared_error(trueValues,predictedValues,  squared=False)
    print("RMSE (without rounding off the predicted resulted): ", rmse)
    mae = mean_absolute_error(trueValues, predictedValues)
    print("MAE (without rounding off the predicted resulted): ", mae)
    
    rmse =  mean_squared_error(trueValues,RoundpredictedValues,  squared=False)
    print("RMSE (with rounding off the predicted resulted): ", rmse)
    mae = mean_absolute_error(trueValues, RoundpredictedValues)
    print("MAE (with rounding off the predicted resulted): ", mae)

    



In [10]:
CollaborativeFiltering(testRatings,5000) # Do not pass any value in the count parameter to run Collaborative filtering on the entire dataset

0
1.1778861342916764  Actual :  1.0
1
1.1504806417690943  Actual :  3.0
2
0.0703723261341116  Actual :  3.0
3
0.2755502385381874  Actual :  3.0
4
0.7164990298775042  Actual :  1.0
5
0.48790377790925543  Actual :  4.0
6
0.7286788663466837  Actual :  4.0
7
1.5368111680713619  Actual :  4.0
8
1.373144827564448  Actual :  4.0
9
0.6535605906370967  Actual :  2.0
10
0.13136408862928217  Actual :  3.0
11
-1.1341904616813119  Actual :  1.0
12
0.11459525014808047  Actual :  4.0
13
1.0  Actual :  1.0
14
1.124653180238647  Actual :  5.0
15
0.6076138811916234  Actual :  3.0
16
1.1545593447509006  Actual :  4.0
17
0.770031653212381  Actual :  5.0
18
0.7321567106445359  Actual :  4.0
19
0.6874527659790157  Actual :  1.0
20
1.2678166390881298  Actual :  4.0
21
1.122754043511966  Actual :  3.0
22
1.2203779128397287  Actual :  5.0
23
0.6490730047942801  Actual :  4.0
24
1.3130431188619993  Actual :  4.0
25
0.6208253902100371  Actual :  3.0
26
1.04228289050518  Actual :  2.0
27
0.7876913801089356  Actua

## Q2: Neural Networks, K-nearest neighbors and SVMs

### Fetching the MNIST dataset and dividing it into Test and Train datasets

In [22]:
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X / 255.

X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

### SVM Classifier:


In [19]:
params = [  {'C':[0.00001, 0.0001, 0.001, 0.01 , 0.1, 1, 10, 100, 1000,10000], 'kernel':['linear']},
            {'C':[0.001, 0.01 , 0.1, 1, 10], 'kernel':['rbf'], 'gamma':[10.0 , 1.0 , 0.001 , 0.0001]},
            {'C':[0.001, 0.01 , 0.1, 1, 10], 'kernel':['poly'], 'degree': [2,3,4] ,'gamma':[10.0 , 1.0 , 0.001 , 0.0001], 'coef0' : [0.1 , 0.01, 0]},
            {'C' :[0.001, 0.01 , 0.1, 1, 10], 'kernel':['sigmoid'], 'gamma':[10.0 , 1.0 , 0.001 , 0.0001], 'coef0' : [0.1 , 0.01, 0]}
        ]

svc=SVC()
for i in range(4):
        
        print("Executing for : ", params[i])
        GS = GridSearchCV(estimator = svc, param_grid = params[i], scoring = 'accuracy', cv = 5, n_jobs=-1, verbose=1)
        GS.fit(X_train[:1000], y_train[:1000])

        print(pd.DataFrame(GS.cv_results_))
        pd.DataFrame(GS.cv_results_).to_csv('SVMClassifierResults.csv',mode='a' ,index=False)
        print("Best Score achieved : {:.4f}".format(GS.best_score_))
        print("Parameters that give the best results :",(GS.best_params_))
        score = GS.score(X_test, y_test)
        print('Score : {0:0.4f}'.format(score))
        print('Error Rate: {0:0.4f}'.format(1 - score))
        print()



Executing for :  {'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], 'kernel': ['linear']}
Fitting 5 folds for each of 10 candidates, totalling 50 fits
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  param_C  \
0       1.924859      0.287053         0.554355        0.079446  0.00001   
1       2.047774      0.374928         0.516434        0.116720   0.0001   
2       2.073642      0.153148         0.385570        0.021359    0.001   
3       0.595169      0.475490         0.284839        0.013596     0.01   
4       0.474730      0.082471         0.252326        0.014221      0.1   
5       0.418680      0.103212         0.224201        0.017172        1   
6       0.364225      0.070327         0.238363        0.016231       10   
7       0.452587      0.052988         0.228589        0.006353      100   
8       0.354651      0.023114         0.230982        0.004660     1000   
9       0.280852      0.042440         0.175931        0.053058    10000  

### MLP Classifier:

In [24]:
params = [ {'hidden_layer_sizes':[(10,5), (5,2)], 'activation':['relu','logistic','tanh','identity'],'solver':['lbfgs', 'sgd', 'adam'],'alpha':[0.001, 0.1 , 10 , 50] , 'verbose':[1]}]

mlp = MLPClassifier()
GS = GridSearchCV(estimator = mlp, param_grid = params[0], scoring = 'accuracy', cv = 5, n_jobs=-1, verbose=1)
GS.fit(X_train[:1000], y_train[:1000])

print(pd.DataFrame(GS.cv_results_))
pd.DataFrame(GS.cv_results_).to_csv('MLPClassifierResults.csv', mode='a', index=False)

print("Best Score achieved : {:.4f}".format(GS.best_score_))
print("Parameters that give the best results :",(GS.best_params_))
score = GS.score(X_test, y_test)
print('Score : {0:0.4f}'.format(score))
print('Error Rate: {0:0.4f}'.format(1 - score))

Fitting 5 folds for each of 96 candidates, totalling 480 fits
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        5.694499      0.781283         0.023939        0.004038   
1       12.838562      1.136323         0.023338        0.008662   
2       12.761119      0.780686         0.019350        0.004486   
3        2.459246      1.889012         0.024137        0.012384   
4        9.961001      0.328976         0.017755        0.002707   
..            ...           ...              ...             ...   
91      12.676273      1.118149         0.020546        0.002572   
92      13.044562      1.117169         0.038895        0.020006   
93       3.054449      0.971486         0.032914        0.018280   
94      13.462917      1.114264         0.024138        0.011907   
95       9.550546      1.978941         0.013165        0.006030   

   param_activation param_alpha param_hidden_layer_sizes param_solver  \
0              relu       0.001                 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


### KNN Classifier:

In [25]:
params = [ {'n_neighbors':[5, 50, 200], 'weights':['uniform','distance'],'algorithm':['ball_tree', 'kd_tree', 'brute'],'p':[1,2]}]

knn = KNeighborsClassifier()
GS = GridSearchCV(estimator = knn, param_grid = params[0], scoring = 'accuracy', cv = 5, n_jobs=-1, verbose=1)
GS.fit(X_train[:1000], y_train[:1000])

print(pd.DataFrame(GS.cv_results_))
pd.DataFrame(GS.cv_results_).to_csv('KNNClassifierResults.csv',  mode='a',index=False)

print("Best Score achieved : {:.4f}".format(GS.best_score_))
print("Parameters that give the best results :",(GS.best_params_))
score = GS.score(X_test, y_test)
print('Score : {0:0.4f}'.format(score))
print('Error Rate: {0:0.4f}'.format(1 - score))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.162475      0.061252         0.698167        0.047281   
1        0.168685      0.053060         0.644358        0.072315   
2        0.148801      0.028120         0.594611        0.043144   
3        0.127060      0.036272         0.547535        0.063451   
4        0.130452      0.018318         0.611307        0.098851   
5        0.161767      0.031788         0.691458        0.091574   
6        0.124507      0.016488         0.524899        0.064217   
7        0.118387      0.010224         0.530800        0.047690   
8        0.134639      0.020987         0.583039        0.055088   
9        0.115688      0.015707         0.615773        0.198080   
10       0.116687      0.010152         0.726743        0.088444   
11       0.246054      0.016641         1.029330        0.291806   
12       0.209050      0.103201         0.985936      