In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Movie Recommendation Predict Solution

### Honour Code

I {**TEAM_JM1**}, confirm - by submitting this document - that the solutions in this notebook are a result of my own work and that I abide by the [EDSA honour code](https://drive.google.com/file/d/1QDCjGZJ8-FmJE3bZdIQNwnJyQKPhHZBn/view?usp=sharing).

Non-compliance with the honour code constitutes a material breach of contract.

### Predict Overview: Movie Recommendation System

Recommender systems are socially and economically critical to ensure that individuals can make optimised choices surrounding the content they engage with on a daily basis. Hence, providing an accurate and robust solution to this challenge has immense economic potential, with users of the system being given personalised recommendations. Our team has been tasked to:

- 1. analyse the supplied data;
- 2. clean the text data provided;
- 3. determine if additional features can be added to enrich the data set;
- 4. build a model that is capable of recommending a movie to a new user or continual user based on their consumption patterns;
- 5. evaluate the accuracy of the best machine learning model;
- 6. determine what features were most important in the model’s prediction decision, and
- 7. explain the inner working of the model to a non-technical audience.

<a id ="cont"></a>

## Table of Contents

<a href=#one>1. Importing Packages</a>

<a href=#four>4. Exploratory Data Analysis (EDA)</a>

<a href=#five>5. Data Engineering</a>

<a href=#six>6. Modeling</a>

<a href=#seven>7. Model Performance</a>

<a href=#eight>8. Model Explanations</a>

<a href=#nine>9. Model Submission</a>

## Connect to Comet

We will use the Comet platform as a version control platform for this project. We will start an experiment that will record the whole process of our model deployment.

In [2]:
# # Import comet_ml at the top of your file
# from comet_ml import Experiment

# # Create an experiment with your api key.
# # SECURITY: never commit the real key. Read it from the environment
# # (for example a Kaggle secret exported as COMET_API_KEY). The key that
# # used to be hard-coded here was exposed and should be rotated.
# # NOTE(review): later cells reference `experiment`, which is only
# # defined when this cell is enabled.
# experiment = Experiment(
#     api_key=os.environ["COMET_API_KEY"],
#     project_name="team-jm6",
#     workspace="oluyemi",
# )

# # Run your code and go to /

 <a id="one"></a>
## 1. Importing Packages
<a href=#cont>Back to Table of Contents</a>


In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from wordcloud import WordCloud, STOPWORDS
import os      
import surprise
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import train_test_split
import time
from surprise import SVD
from surprise import accuracy
import re
import plotly.express as px
import scipy as sp
from wordcloud import WordCloud, STOPWORDS
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline  import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, Normalizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.manifold import TSNE

The required movie data and metadata have been provided and will be loaded for the purpose of our model building.

In [4]:
# Read every competition CSV into its own DataFrame.
# All files live in the same read-only Kaggle input folder.
DATA_DIR = '/kaggle/input/edsa-movie-recommendation-2022'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
tags_df = pd.read_csv(f'{DATA_DIR}/tags.csv')
movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
links_df = pd.read_csv(f'{DATA_DIR}/links.csv')
imdb_df = pd.read_csv(f'{DATA_DIR}/imdb_data.csv')
genome_tags = pd.read_csv(f'{DATA_DIR}/genome_tags.csv')
genome_score = pd.read_csv(f'{DATA_DIR}/genome_scores.csv')

In [5]:
# #Creating a Summary function
# def Summary(df):
#     return df.info()

We will take a look at our **train** data and also our **movies** data

In [6]:
train_df.info()

In [7]:
#Displaying the top 5 items in our train dataset
train_df.head()

In [8]:
# Summarise how many rows each loaded frame contains.
_frames_by_name = {
    'train_df': train_df,
    'test_df': test_df,
    'tags_df': tags_df,
    'imdb_df': imdb_df,
    'links_df': links_df,
    'movies_df': movies_df,
    'genome_tags': genome_tags,
    'genome_score': genome_score,
}
dataframes = list(_frames_by_name)
sizes = [len(frame) for frame in _frames_by_name.values()]
total_size_df = pd.DataFrame({'dataframe': dataframes, 'sizes': sizes})
total_size_df

From the sizes of the different datasets available, it is evident how large our task may be. Frames of this size require resource-intensive operations, which we have to keep in mind.

In [9]:
# Keep only the frames with more than 100,000 rows.
_is_large = total_size_df['sizes'].gt(100000)
total_size_df = total_size_df.loc[_is_large]
total_size_df

In [10]:
# Represent all the small frames with a single 'other' row.
# `DataFrame.append` was removed in pandas 2.0, so the row is added with
# `pd.concat`. Any existing 'other' row is dropped first so that re-running
# this cell does not stack duplicates.
new_row = {'dataframe': 'other', 'sizes': 180530}
total_size_df = pd.concat(
    [total_size_df[total_size_df['dataframe'] != 'other'],
     pd.DataFrame([new_row])],
    ignore_index=True,
)
total_size_df

<a id="four"></a>
## 4. Exploratory Data Analysis (EDA)
<a class="anchor" id="1.1"></a>
<a href=#cont>Back to Table of Contents</a>


In [11]:
# Pie chart of the share of rows held by each (large) frame.
# The explode offsets are derived from the number of slices so the chart
# still renders if the number of rows in `total_size_df` changes; the last
# slice ('other') is pulled out further. For five slices this gives
# (0.05, 0.04, 0.05, 0.04, 0.6).
_n_slices = len(total_size_df)
_offsets = [0.05 if i % 2 == 0 else 0.04 for i in range(_n_slices)]
if _offsets:
    _offsets[-1] = 0.6
explodeTuple = tuple(_offsets)

fig1, ax1 = plt.subplots(figsize=(14,7))
ax1.pie(total_size_df['sizes'].values,
        labels=total_size_df['dataframe'].values,
        startangle=90, autopct='%1.1f%%',
        explode=explodeTuple)
ax1.axis('equal')
plt.title('Distribution of overall Data Frames')
plt.show()

In [12]:
# Interactive bar chart of the row count of each frame (test_df excluded).
_row_counts = {
    'train_df': len(train_df),
    'tags_df': len(tags_df),
    'imdb_df': len(imdb_df),
    'links_df': len(links_df),
    'movies_df': len(movies_df),
    'genome_tags': len(genome_tags),
    'genome_score': len(genome_score),
}
len_list = [[label, n_rows] for label, n_rows in _row_counts.items()]
len_df = pd.DataFrame(len_list, columns=['Dataset', 'Size'])

fig = px.bar(len_df, x='Dataset', y='Size', color='Dataset',
             title='Distribution of overall Data Frames')
# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(legend={
    'orientation': "h",
    'yanchor': "bottom",
    'y': 1.02,
    'xanchor': "right",
    'x': 1,
})
fig.show()


We will take a look at the datasets available and check for any null values present.

In [13]:
# Per-column null counts for every frame, each as a one-column DataFrame.
train_count = train_df.isnull().sum().to_frame()
test_count = test_df.isnull().sum().to_frame()
tags_count = tags_df.isnull().sum().to_frame()
movies_count = movies_df.isnull().sum().to_frame()
links_count = links_df.isnull().sum().to_frame()
imdb_count = imdb_df.isnull().sum().to_frame()
genomet_count = genome_tags.isnull().sum().to_frame()
genomes_count = genome_score.isnull().sum().to_frame()

In [14]:
train_count

In [15]:
test_count

In [16]:
# Bar chart of null counts per column of tags_df.
_tag_null_counts = tags_count.values.ravel()  # flatten the (n, 1) frame to 1-D
plt.bar(tags_count.index, _tag_null_counts, color='red')
plt.title('Null value count in tags_df')
plt.xlabel('column_name')
plt.ylabel('count')
plt.show()

### Visualizing common users



In [17]:
# Count how many ratings each user has submitted.
# `rename_axis` + `reset_index(name=...)` yields the columns
# ['userId', 'count'] on every pandas version. The previous `rename` relied
# on the pandas 1.x default column names and produced two 'count' columns
# under pandas 2.
train_user = (
    train_df['userId']
    .value_counts()
    .rename_axis('userId')
    .reset_index(name='count')
)
train_user.head()

In [18]:
# Bucket users by how many ratings they have given; each group_* is the
# number of users whose rating count falls inside the bucket.
_n_ratings = train_user['count']
group_one = ((_n_ratings > 0) & (_n_ratings < 50)).sum()
group_two = ((_n_ratings >= 50) & (_n_ratings < 500)).sum()
group_three = ((_n_ratings >= 500) & (_n_ratings < 1000)).sum()
group_four = ((_n_ratings >= 1000) & (_n_ratings < 1500)).sum()
group_five = (_n_ratings >= 1500).sum()

In [19]:
# Summarise the user buckets and plot how many users fall into each.
# The frame is built directly from Python lists: routing mixed text and
# numbers through `np.array` coerced the counts to strings, which made
# plotly treat the y axis as categorical instead of numeric.
trial_error_df = pd.DataFrame({
    'group': ['group_one', 'group_two', 'group_three',
              'group_four', 'group_five'],
    'userId_grouping': [int(group_one), int(group_two), int(group_three),
                        int(group_four), int(group_five)],
    'explanation': ['between 1 and 50',
                    'between 50 and 500',
                    'between 500 and 1000',
                    'between 1000 and 1500',
                    'greater than 1500'],
})
fig = px.bar(trial_error_df,
             x=trial_error_df["group"],
             y=trial_error_df["userId_grouping"],
             color=trial_error_df["group"],
             title='Grouped Rating Distribustion')
# Horizontal legend placed just above the plot area, right-aligned.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
fig.show()
trial_error_df

In [20]:
def user_ratings_count(df, n):
    """
    Draw an annotated bar chart of the ``n`` users with the most ratings.

    Parameters
    ----------
        df (DataFrame): ratings frame containing a ``userId`` column
        n (int): number of top users to show
    """
    top_users = df['userId'].value_counts().head(n)
    plt.figure(figsize=(14,7))
    ax = sns.barplot(x=top_users.index, y=top_users, order=top_users.index,
                     palette='CMRmap', edgecolor="black")
    # Write each bar's value just above it.
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width()/2.
        ax.text(bar_centre, bar_height, '%d' % int(bar_height),
                fontsize=11, ha='center', va='bottom')
    plt.title(f'Top {n} Users by Number of Ratings', fontsize=14)
    plt.xlabel('User ID')
    plt.ylabel('Number of Ratings')
    plt.show()

In [21]:
user_ratings_count(train_df,10)

### Exploring Movie Genres

In [22]:
# One row per (movie, genre) pair.
# `explode` replaces the wide-frame + `stack` approach, whose
# `reset_index([0, 'movieId'])` named the same index level twice and only
# worked by accident.
genres = (
    movies_df[['movieId', 'genres']]
    .assign(Genre=lambda frame: frame['genres'].str.split('|'))
    .explode('Genre')
    .dropna(subset=['Genre'])
    .loc[:, ['movieId', 'Genre']]
    .reset_index(drop=True)
)
genres.head()

In [23]:
# Number of movies per genre, most common genre first.
_genre_order = genres['Genre'].value_counts().index
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(data=genres, x='Genre', order=_genre_order,
              palette='CMRmap', ax=ax)
plt.title('Distribution of Movie Genres', size=25)
plt.xlabel('Genre', size=20)
plt.ylabel('Count', size=20)
plt.xticks(rotation=90)
plt.show()

### Exploring the Movies Data

In [24]:
movies=pd.merge(train_df, movies_df,on='movieId',how='inner')
movies.head()

In [25]:
full_movies = pd.merge(movies,imdb_df,on='movieId',how='inner')
full_movies.head()

Let's look at the movies that received the most individual ratings from users.

In [26]:
def top_n_plot_by_ratings(df,column, n):
    '''
    Plot the ``n`` most frequent values of ``column`` as an annotated bar chart.

    Each row of ``df`` is treated as one rating, so the frequency of a value
    (e.g. a movie title) is the number of ratings it received.

    Parameters
    ----------
        df (DataFrame): ratings frame that contains ``column``
        column (str): column whose values are counted
        n (int): number of top values to include in the visualisation
    '''
    top_values = df[str(column)].value_counts().head(n)
    pretty_name = column.title()

    plt.figure(figsize=(14,7))
    ax = sns.barplot(x=top_values.index, y=top_values, order=top_values.index,
                     palette='CMRmap', edgecolor="black")
    # Write each bar's value just above it.
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., bar_height,
                '%d' % int(bar_height), fontsize=11, ha='center', va='bottom')
    plt.title(f'Top {n} {pretty_name} by Number of Ratings', fontsize=14)
    plt.xlabel(pretty_name)
    plt.ylabel('Number of Ratings')
    plt.xticks(rotation=90)
    plt.show()

In [27]:
#Top 15 movies by number of ratings received
top_n_plot_by_ratings(movies,'title',15)

In [28]:
# Word cloud built from every movie title.
# Titles are cast to str in place so they can be joined into one text blob.
movies_df['title'] = movies_df['title'].astype('str')
movies_word = movies_df['title']
movies_wordcloud = ' '.join(movies_word)

_cloud = WordCloud(stopwords=STOPWORDS,
                   background_color='White',
                   height=1200,
                   width=900)
title_wordcloud = _cloud.generate(movies_wordcloud)

plt.figure(figsize=(14,7), facecolor=None)
plt.imshow(title_wordcloud)
plt.axis('off')
plt.title('Distribution of words from movie titles')
plt.tight_layout(pad=0)
plt.show()

In [29]:
top_n_plot_by_ratings(movies,'rating',10)

In [30]:
top_n_plot_by_ratings(full_movies,'director',15)

Now we want to look at the distribution of the ratings provided. We want to see what percentage of all ratings each rating value accounts for, i.e. the number of ratings given at each rating value.

In [31]:
# Count of ratings at each rating value, ordered by rating.
# Explicit column names ('rating', 'count') keep the plot working on every
# pandas version; the default names produced by `reset_index()` differ
# between pandas 1.x and 2.x.
movieRatingDistGroup = (
    train_df['rating']
    .value_counts()
    .sort_index()
    .rename_axis('rating')
    .reset_index(name='count')
)
fig, ax = plt.subplots(figsize=(14,7))
sns.barplot(data=movieRatingDistGroup, x='rating', y='count', palette="CMRmap", edgecolor="black", ax=ax)
ax.set_xlabel("Rating")
ax.set_ylabel('Number of Users')
# Thousands separators on the y axis.
ax.set_yticklabels(['{:,}'.format(int(x)) for x in ax.get_yticks().tolist()])
total = float(movieRatingDistGroup['count'].sum())
# Annotate every bar with its share of all ratings.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2., height+350, '{0:.2%}'.format(height/total), fontsize=11, ha="center", va='bottom')
plt.title('Number of Users Per Rating', fontsize=14)
plt.show()

In [32]:
def count_directors(df, count = 10):
    """
    Count how often each director appears in a DataFrame.

    Parameters
    ----------
        df (DataFrame): input dataframe with a ``director`` column
        count (int): minimum number of films a director needs to be kept

    Returns
    -------
        directors (DataFrame): columns ``director`` and ``count``, limited to
        directors with at least ``count`` films and sorted by ``count``
        in descending order

    Examples
    --------
        >>> df = pd.DataFrame({'director': ['A', 'B', 'A', 'C', 'B', 'A']})
        >>> count_directors(df, count = 2)
          director  count
        0        A      3
        1        B      2
    """
    tally = pd.DataFrame(df['director'].value_counts()).reset_index()
    tally.columns = ['director', 'count']
    # Dropping rare directors keeps the result small enough to analyse.
    frequent = tally.loc[tally['count'] >= count]
    return frequent.sort_values('count', ascending = False)

In [33]:
def feature_count(df, column):
    """
    Draw an annotated bar chart of ``df['count']`` against ``df[column]``.

    Parameters
    ----------
        df (DataFrame): frame with a ``count`` column and the column named by ``column``
        column (str): name of the categorical column shown on the x axis
    """
    label = f'{column}'
    plt.figure(figsize=(14,7))
    ax = sns.barplot(x=df[label], y=df['count'], palette='brg')
    # Write each bar's value just above it.
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., bar_height,
                '%d' % int(bar_height), fontsize=11, ha='center', va='bottom')
    plt.title(f'Number of Movies Per {column}', fontsize=14)
    plt.xlabel(label)
    plt.ylabel('Count')
    plt.xticks(rotation=90)
    plt.show()

In [34]:
directors = count_directors(imdb_df)
feature_count(directors.head(15), 'director')

In [None]:
feature_count(directors[directors['director']!='See full summary'].head(15), 'director')

### Word Clouds

In [35]:
stopwords = set(STOPWORDS)

# Build one lower-cased text blob from every tag.
# Missing tags are dropped first: casting NaN to str would otherwise add the
# literal word "nan" to the cloud. A single join also avoids growing a
# string with `+=` inside a loop over every tag.
_tag_text = tags_df['tag'].dropna().astype(str).str.lower()
comment_words = ' '.join(
    token for tag in _tag_text for token in tag.split()
)

wordcloud = WordCloud(width=1200, height=900,
                      colormap='winter',
                      background_color='white',
                      stopwords=stopwords,collocations=False,
                      min_font_size=10).generate(comment_words)

# plot the WordCloud image
plt.figure(figsize=(14, 7), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.title('Distribution of words in the tags data frame by Tags')
plt.tight_layout(pad=0)

plt.show()

The WORDCLOUD here shows the most common words present in the tags dataset

In [36]:
# Number of times each tag was applied, most frequent first.
# The columns are named explicitly ('genre', 'count') so that the result
# does not depend on the pandas version's default names.
value_count = (
    tags_df['tag']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)

In [37]:
value_count.head()

In [None]:
# Bar chart of the 20 most frequently applied tags.
genre_count = value_count.head(20)
plt.figure(figsize=(14,7))
ax = sns.barplot(x=genre_count['genre'], y=genre_count['count'], palette='CMRmap')
# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., bar_height,
            '%d' % int(bar_height), fontsize=11, ha='center', va='bottom')
plt.title('Number of times a genre tag appears', fontsize=14)
plt.xlabel('Genre tag')
plt.ylabel('Genre tag Count')
plt.xticks(rotation=90)
plt.show()

## Publishing Years

In [None]:
# Extract the release year from the end of each title, e.g. "Toy Story (1995)".
# The pattern requires four digits immediately before a closing parenthesis
# at the end of the title (trailing whitespace allowed). Titles without one
# get the MISSING_YEAR sentinel. This replaces positional slicing guarded by
# a bare `except`, which parsed junk such as " 204" out of a title ending in
# " 2049", and it removes the duplicated copy of this cell.
MISSING_YEAR = 9999
_year_text = (
    movies_df['title']
    .astype(str)
    .str.extract(r'(\d{4})\)\s*$', expand=False)
)
movies_df['Publish Year'] = (
    pd.to_numeric(_year_text, errors='coerce')
    .fillna(MISSING_YEAR)
    .astype(int)
)

In [None]:
len(movies_df)

In [None]:
len(movies_df[movies_df['Publish Year'] == 9999])

In [None]:
movies_df[(movies_df['Publish Year'] > 1888) &
          (movies_df['Publish Year'] < 2021)]

In [None]:
# Number of movies per publish year.
# The columns are named explicitly ('year', 'count'); the previous `rename`
# only worked with the pandas 1.x default names and produced two 'count'
# columns (and no 'year') under pandas 2.
dataset = (
    movies_df['Publish Year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)
dataset.head()

In [None]:
# The 50 years with the most releases, restricted to plausible years
# (strictly between 1888 and 2021), busiest year first.
_plausible_year = (dataset['year'] > 1888) & (dataset['year'] < 2021)
year_dataset = (
    dataset[_plausible_year]
    .sort_values(by='count', ascending=False)
    .head(50)
)
plt.figure(figsize=(14,7))
ax = sns.barplot(x=year_dataset['year'], y=year_dataset['count'],
                 order=year_dataset['year'], palette='CMRmap')
plt.title('Number of Movies Released Per year', fontsize=14)
plt.xlabel('year')
plt.ylabel('Released Movie Count')
plt.xticks(rotation=90)
plt.show()

## Budget

In [None]:
# Inspect the Python type of a sample budget value.
new_l = list(imdb_df['budget'])
print(type(new_l[9]))

# Data cleaning: fill missing runtimes with the mean runtime and strip the
# thousands separators from the budget strings. A literal (non-regex)
# replacement avoids the invalid '\,' escape sequence that was used before.
imdb_df['runtime'] = imdb_df['runtime'].fillna(imdb_df['runtime'].mean())
imdb_df['budget'] = imdb_df['budget'].str.replace(',', '', regex=False)

# Remaining nulls per column, shown as the cell output.
imdb_df.isnull().sum()

In [None]:
def clean_txt(text):
    """
    Remove every run of digits from ``text``.

    ``text`` is converted with ``str()`` first, so non-string inputs
    (e.g. NaN) are returned as their string form with the digits removed.
    """
    as_string = str(text)
    return re.sub(r'[0-9]+', "", as_string)
imdb_df['currency'] = imdb_df['budget'].apply(clean_txt)
imdb_df.head()

In [None]:
currencies = list(imdb_df['currency'])
# Number of distinct currency markers. One is subtracted for the 'nan'
# marker that missing budgets are turned into by the str() cast.
n_currencies = len(set(currencies))-1
print(f'Number of currencies: {n_currencies}')

# How often each currency marker occurs. The columns are named explicitly
# because the previous `rename` produced two 'count' columns under pandas 2.
currencies_count_df = (
    imdb_df['currency']
    .value_counts()
    .rename_axis('currency')
    .reset_index(name='count')
)

fig = px.bar(currencies_count_df, x=currencies_count_df['currency'],
             y=currencies_count_df['count'],
             color=currencies_count_df['currency'],
             title='Currency Type Distribution')
fig.show()

## Preprocessing

In [None]:
def data_scaler(df):
    """
    Fit a ``StandardScaler`` on ``df`` and return the transformed data.

    Parameters
    ----------
        df: data accepted by ``StandardScaler.fit_transform``

    Returns
    -------
        The result of ``fit_transform`` on ``df``.
    """
    return StandardScaler(with_std=True).fit_transform(df)

In [None]:
genome= genome_score[:10000000]
genome

In [None]:
# Scale a 0.01% random sample of the genome scores. The sample is seeded so
# that the embedding below is reproducible across runs.
scaled_genome = data_scaler(genome.sample(frac=0.0001, random_state=42))

In [None]:
# 3-D t-SNE embedding of the scaled genome sample.
# `random_state` is fixed because t-SNE is stochastic: without it the plots
# below change on every run.
tsne = TSNE(n_components=3, n_jobs=-1, verbose=2, perplexity=10,
            learning_rate=0.1, random_state=42)
tsne.fit(scaled_genome)

In [None]:
#Axes3D

fig = plt.figure(figsize=(14, 7))

# Add 3D scatter plot
ax = fig.add_subplot(projection='3d')
ax.scatter(tsne.embedding_[:,0], tsne.embedding_[:,1], tsne.embedding_[:,2])
plt.show()

In [None]:
fig = plt.figure(figsize=(14, 7))
sns.scatterplot(x = tsne.embedding_[:,0], y = tsne.embedding_[:,1], size=tsne.embedding_[:,2])
plt.show()

In [None]:
# Below is the dataframe we will be altering.
working_train = train_df.drop(columns='timestamp')
working_train.head()

In [None]:
# Preview of the merged frame: the ratings in `working_train` are left-joined
# on movieId with the genres from movies_df and with the cast, director and
# plot keywords from imdb_df. A left join keeps every rating even when a
# movie has no metadata.
df_work = working_train.set_index('movieId').join([movies_df[['movieId',
                                                           'genres']]
                                                   .set_index('movieId'),
                                                   imdb_df[['movieId',
                                                         'title_cast',
                                                         'director',
                                                         'plot_keywords']].
                                                   set_index('movieId')],
                                                  how='left').reset_index()
df_work.head()

In [None]:
test_df.head()

In [None]:
def _build_corpus_frame(df):
    """
    Join movie metadata onto ``df`` and collapse it into a ``corpus`` column.

    ``df`` must contain ``movieId``. Genres (from ``movies_df``) and cast,
    director and plot keywords (from ``imdb_df``) are left-joined on
    ``movieId``. Every object column is then stripped of whitespace, has
    ``|`` turned into a space, and the columns are joined row by row with a
    single space to form ``corpus``.

    Returns
    -------
        DataFrame: the joined frame with the extra ``corpus`` column
    """
    movie_meta = movies_df[['movieId', 'genres']].set_index('movieId')
    imdb_meta = imdb_df[['movieId', 'title_cast', 'director',
                         'plot_keywords']].set_index('movieId')
    df_work = (
        df.set_index('movieId')
        .join([movie_meta, imdb_meta], how='left')
        .reset_index()
    )

    # '(no genres listed)' is the equivalent of a missing value in genres.
    df_work['genres'] = df_work['genres'].replace('(no genres listed)', '')

    # Missing metadata becomes an empty string so it adds nothing to the corpus.
    df_work = df_work.fillna('')

    text_cols = list(df_work.select_dtypes('object').columns)
    for col in text_cols:
        # Removing whitespace makes multi-word names a single token;
        # '|' separators then become the token boundaries.
        df_work[col] = (
            df_work[col]
            .str.replace(r'\s+', '', regex=True)
            .str.replace('|', ' ', regex=False)
        )

    # Join the text columns row by row. Zipping the columns is far cheaper
    # than a row-wise `apply(axis=1)` on millions of rows.
    df_work['corpus'] = [
        ' '.join(parts)
        for parts in zip(*(df_work[col] for col in text_cols))
    ]
    return df_work


def preprocessor_train(df):
    """
    Build the training corpus.

    Parameters
    ----------
        df (DataFrame): ratings with ``movieId``, ``userId`` and ``rating``

    Returns
    -------
        DataFrame: columns ``movieId``, ``userId``, ``corpus``, ``rating``
    """
    df_work = _build_corpus_frame(df)
    return df_work[['movieId', 'userId', 'corpus', 'rating']]


def preprocessor_test(df):
    """
    Build the test corpus (same processing as training, without ratings).

    Parameters
    ----------
        df (DataFrame): user/movie pairs with ``movieId`` and ``userId``

    Returns
    -------
        DataFrame: columns ``movieId``, ``userId``, ``corpus``
    """
    df_work = _build_corpus_frame(df)
    return df_work[['movieId', 'userId', 'corpus']]

In [None]:
Test = preprocessor_test(test_df)


In [None]:
# Call the method: a bare `Test.info` only displays the bound-method repr.
Test.info()

In [None]:
Train = preprocessor_train(train_df.drop(columns=['timestamp'])) # DO NOT RUN THIS ON LOCAL COMPUTER

In [None]:
Train_1 = Train.drop(columns=['rating', 'userId', 'movieId'])
Test_1 = Test.drop(columns=['userId', 'movieId'])

In [None]:
Test_1.shape

In [None]:
Train_1.shape

In [None]:
Train_1.head()

In [None]:
Test_1.head()

In [None]:
y = Train['rating']

In [None]:
cv = CountVectorizer()
Train_mat =cv.fit_transform(Train_1['corpus'])

In [None]:
Test_mat = cv.transform(Test_1['corpus'])

In [None]:
Train_mat.shape

In [None]:
Test_mat.shape

## Modelling

### Content Based Filtering 

#### Linear Regression 

### Collaborative Based Filtering

In [None]:
# Work on the first 1M ratings so the baseline model trains quickly.
# The earlier `train_df.drop('timestamp', axis=1)` discarded its result and
# had no effect; only the three columns surprise needs are selected below.
train_subset = train_df[:1000000]
reader = Reader(rating_scale=(train_subset['rating'].min(), train_subset['rating'].max()))
data = Dataset.load_from_df(train_subset[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [None]:
train_df.head()

In [None]:
test_df.head()

#### Singular Value Decomposition (SVD)

In [None]:
svd_model = SVD(n_epochs=50,n_factors=400,init_std_dev=0.001,random_state=42,verbose=True)
svd_model.fit(trainset)
svd_predictions = svd_model.test(testset)
svd_rmse = accuracy.rmse(svd_predictions)

In [None]:
# Dictionary for logging SVD model on comet
params = {'model_name': 'SVD'}
metrics = {'RMSE': svd_rmse}

# `experiment` only exists when the Comet setup cell has been run, so the
# logging is skipped instead of raising NameError when it is absent.
_comet_experiment = globals().get('experiment')
if _comet_experiment is not None:
    # Parameters and metrics go to their own Comet endpoints; the RMSE was
    # previously logged as a parameter.
    _comet_experiment.log_parameters(params)
    _comet_experiment.log_metrics(metrics)
    # End the experiment for the SVD experiment
    _comet_experiment.end()

## Hyperparameter Tuning

In [None]:
# Use the full ratings set (not the 1M subset) for the tuned model.
# The earlier `train_df.drop('timestamp', axis=1)` discarded its result and
# had no effect; only the three columns surprise needs are selected below.
reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.30, random_state=42)

In [None]:
svd_model_hyper = SVD(n_epochs=50,n_factors=250,init_std_dev=0.001,random_state=42,verbose=True)
svd_model_hyper.fit(trainset)
svd_predictions_hyper = svd_model_hyper.test(testset)
svd_rmse_hyper = accuracy.rmse(svd_predictions_hyper)

In [None]:
# Dictionary for logging the tuned SVD model on comet
params = {'model_name': 'SVD'}
metrics = {'RMSE': svd_rmse_hyper}

# `experiment` only exists when the Comet setup cell has been run, so the
# logging is skipped instead of raising NameError when it is absent.
# NOTE(review): an experiment that an earlier cell has already ended may not
# accept further logs — confirm whether a new experiment is needed here.
_comet_experiment = globals().get('experiment')
if _comet_experiment is not None:
    # Parameters and metrics go to their own Comet endpoints; the RMSE was
    # previously logged as a parameter.
    _comet_experiment.log_parameters(params)
    _comet_experiment.log_metrics(metrics)
    # End the experiment for the SVD experiment
    _comet_experiment.end()

### Kaggle Submission

In [None]:
# The Kaggle test set has no ratings, and `Test` is a preprocessed DataFrame
# rather than a list of (user, item, rating) tuples, so it cannot be passed
# to `.test()`. RMSE is therefore reported on the labelled hold-out split.
# NOTE(review): `svd_model` was fitted on a different split than the current
# `testset`, so some hold-out rows may have been seen during training —
# confirm which model/split should be reported before quoting this score.
svd_predictions_test = svd_model.test(testset)
svd_rmse_test = accuracy.rmse(svd_predictions_test)

In [None]:
test_df.userId=test_df['userId'].astype(int)
test_df.movieId=test_df['movieId'].astype(int)

In [None]:
# Submission key in the form "<userId>_<movieId>".
# Built column-wise: a row-wise `apply` is slow on the full test set and
# up-casts the ids to floats ("1.0_2.0") once a float column such as
# `rating` exists, e.g. when this cell is re-run.
test_df['Id'] = (
    test_df['userId'].astype(int).astype(str)
    + '_'
    + test_df['movieId'].astype(int).astype(str)
)

In [None]:
# Predict a rating for every (user, movie) pair in the test set.
# Zipping the two id columns avoids building a Series per row, which is what
# `apply(axis=1)` does and which is very slow on millions of rows.
test_df["rating"] = [
    svd_model.predict(user_id, movie_id).est
    for user_id, movie_id in zip(test_df["userId"], test_df["movieId"])
]
submission = test_df[["Id", "rating"]]

In [None]:
submission.to_csv('submission3_svd.csv', index=False)