## Recommendation System using Pearson's Correlation


In [None]:
# pandas is imported in this cell so the notebook runs top-to-bottom on a
# fresh kernel; without it the first use of `pd` below is a NameError.
import pandas as pd

# Ratings table (one row per customer/movie rating).
# NOTE(review): absolute, machine-specific path — consider a path relative to
# the notebook so the analysis can be reproduced elsewhere.
df = pd.read_csv("D:/Full_Data.csv")

# Title lookup table. The file has no header row, so column names are given
# explicitly; Movie_Id becomes the index so titles can be joined by movie id.
df_title = pd.read_csv(
    "./movie_titles.csv",
    encoding="ISO-8859-1",
    header=None,
    names=["Movie_Id", "Year", "Name"],
).set_index("Movie_Id")
print("Dataset shape: {}".format(df.shape))


Dataset shape: (17337458, 3)


In [None]:
# Preview the first five rating rows.
df.head(n=5)


Unnamed: 0,Cust_Id,Rating,Movie_Id
0,712664,5.0,3
1,1331154,4.0,3
2,2632461,3.0,3
3,44937,5.0,3
4,656399,4.0,3


In [None]:
# Preview the first five rows of the title lookup table.
df_title.head(n=5)


Unnamed: 0_level_0,Year,Name
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [None]:
# Per-movie rating statistics: how many ratings each movie received and
# their average value.
f = ["count", "mean"]

ratings_by_movie = df.groupby("Movie_Id")["Rating"]
df_movie_summary = ratings_by_movie.agg(f)
# Normalise the index labels to plain ints so they line up with other
# integer movie-id indexes when joining.
df_movie_summary.index = df_movie_summary.index.map(int)


In [None]:
# Customer-by-movie matrix: one row per Cust_Id, one column per Movie_Id,
# cells hold the rating (NaN where the customer did not rate the movie).
df_p = df.pivot_table(values="Rating", index="Cust_Id", columns="Movie_Id")
df_p.shape


(143458, 1350)

In [None]:
def recommend(movie_title: str, min_count: int) -> None:
    """Print the 10 movies whose ratings correlate best with ``movie_title``.

    The title is looked up in ``df_title``; that movie's column of the
    customer-by-movie matrix ``df_p`` is then correlated with every column
    of ``df_p`` (``DataFrame.corrwith``). The highest correlations are
    printed together with each movie's name and its rating ``count`` and
    ``mean`` taken from ``df_movie_summary``.

    The queried movie is itself a column of ``df_p``, so it is listed as
    well (normally first, with a correlation of 1.0).

    Args:
        movie_title: Exact value of the ``Name`` column in ``df_title``.
        min_count: Only movies with strictly more than this many ratings
            are printed.

    Raises:
        IndexError: If no row of ``df_title`` has exactly this name.
        KeyError: If the title exists but none of its movie ids is a
            column of ``df_p``.
    """
    # Validate the title before printing anything, so a typo produces one
    # clear error instead of a header followed by a cryptic traceback.
    matching_ids = [int(m) for m in df_title.index[df_title["Name"] == movie_title]]
    if not matching_ids:
        raise IndexError("No movie named {!r} found in df_title".format(movie_title))

    # Several ids can share one title; use the first one that actually has
    # ratings in the pivot table rather than blindly taking the first match.
    rated_ids = [m for m in matching_ids if m in df_p.columns]
    if not rated_ids:
        raise KeyError(
            "Movie {!r} (ids {}) has no ratings column in df_p".format(
                movie_title, matching_ids
            )
        )
    movie_id = rated_ids[0]

    print("For movie ({})".format(movie_title))
    print("- Top 10 movies recommended based on Pearsons'R correlation - ")

    # Correlate every movie column with the target column; movies for which
    # no correlation can be computed come back as NaN and are dropped.
    correlations = df_p.corrwith(df_p[movie_id]).dropna()
    corr_target = correlations.to_frame("PearsonR").sort_values(
        "PearsonR", ascending=False
    )
    corr_target.index = corr_target.index.map(int)

    # Attach title and rating statistics; both joins are on the movie id index.
    corr_target = corr_target.join(df_title).join(df_movie_summary)[
        ["PearsonR", "Name", "count", "mean"]
    ]
    popular_enough = corr_target[corr_target["count"] > min_count]
    print(popular_enough[:10].to_string(index=False))


In [None]:
# No popularity threshold: every movie with at least one rating qualifies.
recommend(movie_title="X2: X-Men United", min_count=0)


For movie (X2: X-Men United)
- Top 10 movies recommended based on Pearsons'R correlation - 
 PearsonR                                                  Name  count     mean
 1.000000                                      X2: X-Men United  73684 3.929157
 0.384550                              Batman Beyond: The Movie   2275 3.730549
 0.375967                                        Justice League   3049 3.692030
 0.361393                      Justice League: Justice on Trial   2499 3.705482
 0.338025                    Batman Beyond: Return of the Joker   3250 3.590769
 0.335256                                         Batman Begins  37558 4.244315
 0.328229                          Batman: Mask of the Phantasm   2523 3.753468
 0.327040 Batman: The Animated Series: Tales of the Dark Knight   2179 3.581000
 0.316666                       Dragon Ball Z: Super Android 13   1859 3.348037
 0.316166                              Mortal Kombat: The Movie   6984 3.150200


In [None]:
# No popularity threshold: every movie with at least one rating qualifies.
recommend(movie_title="The Amityville Horror", min_count=0)


For movie (The Amityville Horror)
- Top 10 movies recommended based on Pearsons'R correlation - 
 PearsonR                                  Name  count     mean
 1.000000                 The Amityville Horror  10492 3.448246
 0.486752          Omen III: The Final Conflict   2779 3.170925
 0.481592                    Holiday in the Sun   1468 3.297003
 0.470641                  Children of the Corn  12498 3.212834
 0.470275                                  Cujo  10676 3.296834
 0.459427                       Damien: Omen II   4715 3.365005
 0.454038 Children of the Corn 4: The Gathering   1863 2.334944
 0.453149                             Inu-Yasha   1042 4.457774
 0.445397                          Jo Jo Dancer   2061 2.961184
 0.443583           Candyman 3: Day of the Dead   1734 2.839100


## Recommendation System using Singular Value Decomposition (SVD)


In [None]:
import time
import warnings

import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and pandas copy warnings —
# consider narrowing the filter to the specific warnings that are noisy.
warnings.simplefilter("ignore")


In [2]:
# Ratings table for the SVD section. To run on the alternative ratings file
# instead, point read_csv at "./Above-4-Rating.csv".
df = pd.read_csv("./Netflix-Prize-Dataset.csv")

# Title lookup table: the file has no header row, so the column names are
# supplied here and Movie_Id is used as the index.
title_columns = ["Movie_Id", "Year", "Name"]
df_title = pd.read_csv(
    "./movie_titles.csv",
    encoding="ISO-8859-1",
    header=None,
    names=title_columns,
).set_index("Movie_Id")

print("Dataset shape: {}".format(df.shape))
df.info()


Dataset shape: (9671171, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9671171 entries, 0 to 9671170
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   Cust_Id   int64  
 1   Rating    float64
 2   Movie_Id  int64  
dtypes: float64(1), int64(2)
memory usage: 221.4 MB


In [3]:
# Per-movie rating statistics: number of ratings and average rating.
f = ["count", "mean"]

ratings_by_movie = df.groupby("Movie_Id")["Rating"]
df_movie_summary = ratings_by_movie.agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)

# Popularity cut-off: the 70th percentile of the per-movie rating counts.
# Movies with fewer ratings than that are collected in drop_movie_list.
rating_counts = df_movie_summary["count"]
movie_benchmark = round(rating_counts.quantile(0.7), 0)
is_unpopular = rating_counts < movie_benchmark
drop_movie_list = df_movie_summary[is_unpopular].index


Let's pivot the dataset into one large customer-by-movie matrix of ratings - we need it for our recommendation system:


In [4]:
# Customer-by-movie matrix: one row per Cust_Id, one column per Movie_Id,
# cells hold the rating (NaN where the customer did not rate the movie).
df_p = df.pivot_table(values="Rating", index="Cust_Id", columns="Movie_Id")
df_p.shape


(143441, 1350)

Below are some of the movies that user 305344 rated 5 stars in the past:


In [5]:
# Titles of the movies this customer gave the top rating (5).
is_five_star_by_user = df["Cust_Id"].eq(305344) & df["Rating"].eq(5)
df_305344 = (
    df[is_five_star_by_user]
    .set_index("Movie_Id")
    .join(df_title)["Name"]  # look the names up by movie id
)
df_305344.head()


Movie_Id
57        Richard III
175    Reservoir Dogs
311           Ed Wood
329             Dogma
331       Chasing Amy
Name: Name, dtype: object

In [6]:
# Fixed seed so the fold split and the SVD initialisation (and therefore the
# reported RMSE/MAE and the later predictions) are reproducible across runs.
SEED = 0

t0 = time.time()

reader = Reader()
# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(df[["Cust_Id", "Movie_Id", "Rating"]], reader)
svd = SVD(random_state=SEED)

# 5-fold cross-validation (same number of folds as the library default),
# with the shuffling made deterministic. The per-fold metrics are kept in
# cv_results so they can be inspected later instead of only being printed.
cv_results = cross_validate(
    svd,
    data,
    measures=["RMSE", "MAE"],
    cv=KFold(n_splits=5, random_state=SEED, shuffle=True),
    verbose=True,
)

# Wall-clock time of the whole cross-validation, in seconds.
train_time = round(time.time() - t0, 3)
print("\nTime Taken For Running the Algorithm --> ", train_time)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4409  0.4408  0.4412  0.4408  0.4409  0.4409  0.0002  
MAE (testset)     0.3769  0.3768  0.3769  0.3766  0.3766  0.3768  0.0001  
Fit time          859.58  888.42  861.27  885.42  891.83  877.31  13.94   
Test time         53.88   51.85   50.95   46.31   49.77   50.55   2.51    
Time Taken For Running the Algorithm -->  4828.203


Let's predict which movies user 305344 would love to watch:


In [7]:
# Candidate movies: only titles that occur in the ratings data AND are not in
# drop_movie_list. Filtering with `~isin(drop_movie_list)` alone would keep
# every title that has no ratings at all, and the model would then be asked
# to score movies it has never seen.
candidate_movie_ids = df_movie_summary.index.difference(drop_movie_list)
user_305344 = df_title.reset_index()
# .copy() so the new column below is added to an independent frame rather
# than to a slice of the title table.
user_305344 = user_305344[user_305344["Movie_Id"].isin(candidate_movie_ids)].copy()

# Fit the model on every available rating (cross-validation above only
# trained on folds).
data = Dataset.load_from_df(df[["Cust_Id", "Movie_Id", "Rating"]], reader)

trainset = data.build_full_trainset()
svd.fit(trainset)

# Estimated rating of this customer for each candidate movie.
user_305344["Estimate_Score"] = user_305344["Movie_Id"].apply(
    lambda x: svd.predict(305344, x).est
)

user_305344 = user_305344.drop("Movie_Id", axis=1).sort_values(
    "Estimate_Score", ascending=False
)
print(user_305344.head(10))


        Year                            Name  Estimate_Score
2113  2002.0                         Firefly        4.745697
1475  2004.0        Six Feet Under: Season 4        4.692134
1255  1994.0     The Best of Friends: Vol. 4        4.677097
2941  1999.0               Friends: Season 6        4.674388
2101  1994.0          The Simpsons: Season 6        4.669048
1494  2001.0                 Alias: Season 1        4.667504
2802  1995.0             Pride and Prejudice        4.655969
269   2001.0      Sex and the City: Season 4        4.645267
4352  2002.0  Curb Your Enthusiasm: Season 3        4.638450
3077  1994.0   The Best of Friends: Season 2        4.635281


In [8]:
# Candidate movies: only titles that occur in the ratings data AND are not in
# drop_movie_list. Filtering with `~isin(drop_movie_list)` alone would keep
# every title that has no ratings at all, and the model would then be asked
# to score movies it has never seen.
candidate_movie_ids = df_movie_summary.index.difference(drop_movie_list)
user_387418 = df_title.reset_index()
# .copy() so the new column below is added to an independent frame rather
# than to a slice of the title table.
user_387418 = user_387418[user_387418["Movie_Id"].isin(candidate_movie_ids)].copy()

# `svd` was already fitted on the full training set in the previous
# prediction cell; rebuilding the identical dataset and refitting here would
# only repeat that long training run, so the fitted model is reused.

# Estimated rating of this customer for each candidate movie.
user_387418["Estimate_Score"] = user_387418["Movie_Id"].apply(
    lambda x: svd.predict(387418, x).est
)

user_387418 = user_387418.drop("Movie_Id", axis=1).sort_values(
    "Estimate_Score", ascending=False
)
print(user_387418.head(10))


        Year                                           Name  Estimate_Score
1475  2004.0                       Six Feet Under: Season 4        4.762624
2101  1994.0                         The Simpsons: Season 6        4.749856
2113  2002.0                                        Firefly        4.734036
1255  1994.0                    The Best of Friends: Vol. 4        4.711141
2941  1999.0                              Friends: Season 6        4.707899
1494  2001.0                                Alias: Season 1        4.703556
2056  2001.0             Buffy the Vampire Slayer: Season 6        4.702866
269   2001.0                     Sex and the City: Season 4        4.689796
3443  2004.0          Family Guy: Freakin' Sweet Collection        4.673524
2451  2001.0  Lord of the Rings: The Fellowship of the Ring        4.673046


In [9]:
# Candidate movies: only titles that occur in the ratings data AND are not in
# drop_movie_list. Filtering with `~isin(drop_movie_list)` alone would keep
# every title that has no ratings at all, and the model would then be asked
# to score movies it has never seen.
candidate_movie_ids = df_movie_summary.index.difference(drop_movie_list)
user_2439493 = df_title.reset_index()
# .copy() so the new column below is added to an independent frame rather
# than to a slice of the title table.
user_2439493 = user_2439493[
    user_2439493["Movie_Id"].isin(candidate_movie_ids)
].copy()

# `svd` was already fitted on the full training set in an earlier prediction
# cell; rebuilding the identical dataset and refitting here would only repeat
# that long training run, so the fitted model is reused.

# Estimated rating of this customer for each candidate movie.
user_2439493["Estimate_Score"] = user_2439493["Movie_Id"].apply(
    lambda x: svd.predict(2439493, x).est
)

user_2439493 = user_2439493.drop("Movie_Id", axis=1).sort_values(
    "Estimate_Score", ascending=False
)
print(user_2439493.head(10))


        Year                                   Name  Estimate_Score
2113  2002.0                                Firefly        4.932801
2101  1994.0                 The Simpsons: Season 6        4.884873
1475  2004.0               Six Feet Under: Season 4        4.878296
2941  1999.0                      Friends: Season 6        4.876441
3443  2004.0  Family Guy: Freakin' Sweet Collection        4.861074
2056  2001.0     Buffy the Vampire Slayer: Season 6        4.857031
1255  1994.0            The Best of Friends: Vol. 4        4.849698
269   2001.0             Sex and the City: Season 4        4.846333
1494  2001.0                        Alias: Season 1        4.837559
4352  2002.0         Curb Your Enthusiasm: Season 3        4.825983


In [10]:
# Customers with the most ratings; value_counts already sorts from the
# largest count down, so the first five rows are the five most active.
ratings_per_customer = df["Cust_Id"].value_counts()
ratings_per_customer.head(5)


1664010    1250
2118461    1205
716173     1052
794999      797
303948      736
Name: Cust_Id, dtype: int64