# Recommendation and clustering project

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def read_json(filename):
    """Load a JSON Lines file into a list of dicts, dropping each record's 'messages' entry.

    Parameters
    ----------
    filename : str or path-like
        Path to a text file holding one JSON object per line.

    Returns
    -------
    list of dict
        One dict per non-blank line, in file order, without the 'messages' key.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    output_data = []
    # The context manager closes the handle even when a line fails to parse;
    # the explicit encoding keeps non-ASCII text readable on every platform.
    with open(filename, "r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a stray newline at the end of the file).
                continue
            dict_line = json.loads(line)
            # pop() with a default keeps records that have no 'messages' key
            # instead of aborting the whole load with a KeyError.
            dict_line.pop('messages', None)
            output_data.append(dict_line)
    print(">> Loaded {} lines".format(len(output_data)))
    return output_data

In [3]:
!wget https://github.com/ReDialData/website/raw/data/redial_dataset.zip

--2020-05-04 22:49:04--  https://github.com/ReDialData/website/raw/data/redial_dataset.zip
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ReDialData/website/data/redial_dataset.zip [following]
--2020-05-04 22:49:05--  https://raw.githubusercontent.com/ReDialData/website/data/redial_dataset.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.36.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.36.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5765261 (5.5M) [application/zip]
Saving to: ‘redial_dataset.zip.3’


2020-05-04 22:49:06 (5.41 MB/s) - ‘redial_dataset.zip.3’ saved [5765261/5765261]



In [4]:
# -o: overwrite existing files without prompting; -j: ignore directory paths stored in the
# archive so every file is extracted directly into the working directory.
!unzip -oj redial_dataset.zip

Archive:  redial_dataset.zip
  inflating: movies_with_mentions.csv  
  inflating: test_data.jsonl         
  inflating: train_data.jsonl        


In [5]:
# Load the training records (one JSON object per line); read_json drops each record's 'messages'.
movies_data = read_json("train_data.jsonl")

>> Loaded 10006 lines


In [6]:
# Mention counts per movie, read from the CSV extracted from the archive.
movies_mentions_data = pd.read_csv("movies_with_mentions.csv")

In [7]:
movies_mentions_data

Unnamed: 0,movieId,movieName,nbMentions
0,75796,Headhunter (2009),1
1,75815,Angels in the Outfield (1994),6
2,75822,Eddie and the Cruisers (1983),1
3,75828,Ninja Assassin (2009),5
4,75867,Orgazmo (1997),3
...,...,...,...
6919,206079,2036 Origin Unknown (2018),3
6920,206080,The Hatching (2016),4
6921,206085,Hotel Artemis (2018),1
6922,206087,Hereditary (2018),1


In [8]:
# One row per loaded record; the dict keys become the columns.
# NOTE: the name `movies_df` is rebound further down to the contents of movies.csv.
movies_df = pd.DataFrame(movies_data)


In [9]:
len(movies_df)

10006

In [10]:
# Work on an independent copy of the first 1000 rows only; movies_df itself is left untouched.
movies = movies_df.iloc[:1000].copy()

In [11]:
movies

Unnamed: 0,movieMentions,respondentQuestions,conversationId,respondentWorkerId,initiatorWorkerId,initiatorQuestions
0,{'203371': 'Final Fantasy: The Spirits Within ...,"{'203371': {'suggested': 1, 'seen': 0, 'liked'...",391,1,0,"{'203371': {'suggested': 1, 'seen': 0, 'liked'..."
1,"{'196336': 'Scarface (1983)', '204322': 'Sout...","{'196336': {'suggested': 0, 'seen': 1, 'liked'...",395,1,2,[]
2,"{'79320': 'Contact (2009)', '84001': 'Donnie ...","{'79320': {'suggested': 0, 'seen': 1, 'liked':...",397,1,3,"{'79320': {'suggested': 0, 'seen': 0, 'liked':..."
3,"{'188302': 'Ever After (1998)', '195904': 'Cha...",[],405,5,4,[]
4,"{'119144': 'Wonder Woman (2017)', '125431': '...","{'119144': {'suggested': 0, 'seen': 1, 'liked'...",407,6,7,"{'119144': {'suggested': 0, 'seen': 1, 'liked'..."
...,...,...,...,...,...,...
995,"{'183056': 'The Big Lebowski', '177112': 'Wedd...","{'183056': {'suggested': 1, 'seen': 1, 'liked'...",2804,37,171,"{'183056': {'suggested': 1, 'seen': 1, 'liked'..."
996,"{'185139': 'The Notebook (2004)', '204604': 'B...","{'185139': {'suggested': 1, 'seen': 1, 'liked'...",2805,167,191,"{'185139': {'suggested': 0, 'seen': 1, 'liked'..."
997,"{'78670': 'They Live (1988)', '176641': 'The O...","{'78670': {'suggested': 0, 'seen': 1, 'liked':...",2806,171,37,"{'78670': {'suggested': 0, 'seen': 1, 'liked':..."
998,{'96281': 'Star Wars: Episode VIII – The Last ...,"{'96281': {'suggested': 1, 'seen': 0, 'liked':...",2807,37,171,"{'96281': {'suggested': 1, 'seen': 0, 'liked':..."


In [12]:
def remove_empty_entries(df, column):
    """Return a copy of `df` keeping only the rows whose `column` value has non-zero length.

    `len()` is applied to every value in `column`, so each value must be a sized
    object (dict, list, str, ...). The input frame is not modified.
    """
    has_content = df[column].map(len) > 0
    return df.loc[has_content].copy()

In [13]:
# Keep only the conversations in which both the initiator and the respondent answered the questions.
movies = (
    movies
    .pipe(remove_empty_entries, 'initiatorQuestions')
    .pipe(remove_empty_entries, 'respondentQuestions')
)

In [14]:
movies

Unnamed: 0,movieMentions,respondentQuestions,conversationId,respondentWorkerId,initiatorWorkerId,initiatorQuestions
0,{'203371': 'Final Fantasy: The Spirits Within ...,"{'203371': {'suggested': 1, 'seen': 0, 'liked'...",391,1,0,"{'203371': {'suggested': 1, 'seen': 0, 'liked'..."
2,"{'79320': 'Contact (2009)', '84001': 'Donnie ...","{'79320': {'suggested': 0, 'seen': 1, 'liked':...",397,1,3,"{'79320': {'suggested': 0, 'seen': 0, 'liked':..."
4,"{'119144': 'Wonder Woman (2017)', '125431': '...","{'119144': {'suggested': 0, 'seen': 1, 'liked'...",407,6,7,"{'119144': {'suggested': 0, 'seen': 1, 'liked'..."
5,"{'204334': 'Happy Death Day (2017)', '166736'...","{'204334': {'suggested': 0, 'seen': 0, 'liked'...",411,6,7,"{'204334': {'suggested': 1, 'seen': 1, 'liked'..."
6,"{'106388': 'The Princess Bride (1987)', '1191...","{'106388': {'suggested': 0, 'seen': 0, 'liked'...",415,6,8,"{'106388': {'suggested': 0, 'seen': 1, 'liked'..."
...,...,...,...,...,...,...
995,"{'183056': 'The Big Lebowski', '177112': 'Wedd...","{'183056': {'suggested': 1, 'seen': 1, 'liked'...",2804,37,171,"{'183056': {'suggested': 1, 'seen': 1, 'liked'..."
996,"{'185139': 'The Notebook (2004)', '204604': 'B...","{'185139': {'suggested': 1, 'seen': 1, 'liked'...",2805,167,191,"{'185139': {'suggested': 0, 'seen': 1, 'liked'..."
997,"{'78670': 'They Live (1988)', '176641': 'The O...","{'78670': {'suggested': 0, 'seen': 1, 'liked':...",2806,171,37,"{'78670': {'suggested': 0, 'seen': 1, 'liked':..."
998,{'96281': 'Star Wars: Episode VIII – The Last ...,"{'96281': {'suggested': 1, 'seen': 0, 'liked':...",2807,37,171,"{'96281': {'suggested': 1, 'seen': 0, 'liked':..."


In [15]:
# Empty frame that fixes the column order; the collected records are attached to it in a later cell.
movies_cp = pd.DataFrame(columns=["movie_id", "cid", "rid", "rsuggested", "rseen", "rliked",
                                  "iid", "isuggested", "iseen", "iliked"])
movies_list = []   # one record per (conversation, movie) answered by both workers
movies_title = dict()   # movie id -> title, accumulated over all conversations
for index, row in movies.iterrows():
    # Remember every title mentioned in this conversation (a repeated id keeps the latest title).
    movies_title.update(row['movieMentions'])

    # Sort both answer sets by movie id so they can be walked in lockstep.
    respondent_answers = sorted(row['respondentQuestions'].items())
    initiator_answers = sorted(row['initiatorQuestions'].items())
    for (rmovie_id, rq), (imovie_id, iq) in zip(respondent_answers, initiator_answers):
        if rmovie_id != imovie_id:
            # NOTE(review): the two workers answered about different sets of movies. The rest of
            # this conversation is skipped, including any movies further down that both did
            # answer; zip() also silently ignores surplus answers on the longer side.
            print(">> Wrong row {}".format(index))
            break
        movies_list.append(dict(
            movie_id=rmovie_id,
            cid=row['conversationId'],
            rid=row['respondentWorkerId'],
            rsuggested=rq['suggested'],
            rseen=rq['seen'],
            rliked=rq['liked'],
            iid=row['initiatorWorkerId'],
            isuggested=iq['suggested'],
            iseen=iq['seen'],
            iliked=iq['liked'],
        ))


>> Wrong row 84
>> Wrong row 361
>> Wrong row 374
>> Wrong row 382
>> Wrong row 554
>> Wrong row 740
>> Wrong row 810
>> Wrong row 873
>> Wrong row 999


In [16]:
# Placeholder only: the next cell rebinds movies_titles to a DataFrame.
movies_titles = []

In [17]:
# One row per movie id (the dict keys become the index); the single value column is labelled 0
# until it is renamed in the following cell.
movies_titles = pd.DataFrame.from_dict(movies_title, orient='index')

In [18]:
# Give the value column a readable name.
movies_titles = movies_titles.rename(columns={0: 'title'})

In [19]:
len(movies_titles)

1628

In [20]:
# DataFrame.append was removed in pandas 2.0, and appending also duplicated every row whenever
# this cell was re-run. Rebuild the frame from the collected records instead: same columns, same
# 0..n-1 index, and safe to re-run. dtype=object mirrors the column types that appending onto the
# empty frame produced, so the cells below see the same data as before.
movies_cp = pd.DataFrame(movies_list, columns=list(movies_cp.columns), dtype=object)

In [21]:
movies_cp.describe()

Unnamed: 0,movie_id,cid,rid,rsuggested,rseen,rliked,iid,isuggested,iseen,iliked
count,5353,5353,5353,5353,5353,5353,5353,5353,5353,5353
unique,1620,969,130,2,2,2,116,2,2,2
top,187028,947,56,1,1,1,56,1,1,1
freq,68,14,556,3514,3934,4958,821,3535,3763,4836


In [22]:
# Every collected record for the first movie id in the title table.
movies_cp.loc[movies_cp['movie_id'] == movies_titles.index[0]]

Unnamed: 0,movie_id,cid,rid,rsuggested,rseen,rliked,iid,isuggested,iseen,iliked
4,203371,391,1,1,0,1,0,1,0,1
313,203371,583,14,0,1,1,21,0,1,1
617,203371,766,20,1,1,1,21,1,0,1
755,203371,809,20,1,1,1,21,1,0,1
1732,203371,1269,87,1,0,1,86,1,0,1


In [23]:
# Column-wise totals per movie.
# NOTE(review): sum() is applied to every remaining column, so the cid/rid/iid totals are sums of
# identifiers; presumably only the suggested/seen/liked totals are meaningful. Confirm before use.
movies_table = movies_cp.groupby('movie_id').sum()

In [24]:
# Totals per respondent, most frequently suggested-to first.
# NOTE(review): the `iid` column is still summed here, so it holds a sum of worker ids.
res_table = (
    movies_cp
    .groupby('rid')
    .sum()
    .drop(columns=['movie_id', 'cid'])
    .sort_values('rsuggested', ascending=False)
)

In [25]:
res_table.describe()

Unnamed: 0,rsuggested,rseen,rliked,iid,isuggested,iseen,iliked
count,130.0,130.0,130.0,130.0,130.0,130.0,130.0
mean,27.030769,30.261538,38.138462,2999.061538,27.192308,28.946154,37.2
std,59.900556,62.420623,79.574036,6018.555456,60.616995,57.451741,76.005141
min,1.0,1.0,1.0,11.0,0.0,1.0,1.0
25%,3.0,4.0,4.25,322.0,3.0,4.0,5.0
50%,6.0,6.5,7.0,798.0,6.0,7.0,8.0
75%,19.5,25.0,30.0,2756.5,20.0,23.75,29.0
max,405.0,462.0,533.0,40765.0,423.0,386.0,509.0


In [26]:
# Totals per initiator, most frequent suggester first.
# NOTE(review): the `rid` column is still summed here, so it holds a sum of worker ids.
ini_table = (
    movies_cp
    .groupby('iid')
    .sum()
    .drop(columns=['movie_id', 'cid'])
    .sort_values('isuggested', ascending=False)
)

In [27]:
ini_table.describe()

Unnamed: 0,rid,rsuggested,rseen,rliked,isuggested,iseen,iliked
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,3304.603448,30.293103,33.913793,42.741379,30.474138,32.439655,41.689655
std,7367.530585,63.63524,83.083883,96.402789,63.165409,82.847612,93.596304
min,6.0,1.0,0.0,2.0,1.0,0.0,1.0
25%,320.0,4.0,3.0,5.0,4.0,3.0,4.75
50%,852.0,5.5,6.5,8.0,5.0,6.5,8.0
75%,3079.75,25.5,26.0,30.5,23.0,24.25,30.5
max,58846.0,445.0,687.0,762.0,434.0,704.0,715.0


In [28]:
# Drop initiators whose suggestion total is more than one standard deviation above the mean.
# NOTE: the cutoff is computed from the current contents of ini_table, so re-running this cell
# filters the already-filtered table again.
ini_table = ini_table.loc[
    lambda t: t['isuggested'] <= t['isuggested'].mean() + t['isuggested'].std()
].copy()

In [29]:
# Drop respondents whose suggestion total is more than one standard deviation above the mean.
# NOTE: the cutoff is computed from the current contents of res_table, so re-running this cell
# filters the already-filtered table again.
res_table = res_table.loc[
    lambda t: t['rsuggested'] <= t['rsuggested'].mean() + t['rsuggested'].std()
].copy()

In [30]:
len(res_table)

119

In [31]:
len(ini_table)

107

In [32]:
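# Uncomment the command below and the one in the next cell to download and unpack the
# MovieLens archive before the cells that read movies.csv and ratings.csv.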
#! wget http://files.grouplens.org/datasets/movielens/ml-latest.zip

In [33]:
#! unzip -oj ml-latest.zip

In [34]:
# NOTE: this rebinds `movies_df`; from here on it holds the contents of movies.csv,
# not the frame built from the JSON records earlier in the notebook.
movies_df = pd.read_csv('movies.csv')

In [35]:
# Individual rating rows; they are averaged per movieId further down.
ratings_df = pd.read_csv('ratings.csv')

In [36]:
# Turn the pipe-delimited genre string into a list of genre names per movie.
# NOTE: the column is overwritten in place, so run this cell only once per load of movies.csv.
movies_df['genres'] = movies_df['genres'].str.split('|')

In [37]:
# One indicator column per genre: 1.0 when the movie lists the genre, 0.0 otherwise.
# Two changes from the previous version of this cell:
#   * `drop('genres', 1)` passed `axis` positionally, which pandas 2.0 no longer accepts;
#     the keyword form `drop(columns=...)` is used instead.
#   * the row-by-row `.at` loop is replaced by a vectorised encoding producing the same
#     columns, in the same (first-appearance) order, with the same float values.
genre_per_row = movies_df['genres'].explode().dropna()   # one entry per (movie, genre)
genre_order = pd.unique(genre_per_row)                   # genres in order of first appearance
genre_flags = (
    pd.get_dummies(genre_per_row, dtype=float)
    .groupby(level=0).max()                              # collapse back to one row per movie
    .reindex(columns=genre_order)
)
movies_genres_df = (
    movies_df
    .drop(columns='genres')
    .join(genre_flags)                                   # aligned on the movies_df index
    .fillna(0)
    .rename(columns={'movieId': 'movielens_id'})
)
movies_genres_df.head()

Unnamed: 0,movielens_id,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
movies_titles

Unnamed: 0,title
203371,Final Fantasy: The Spirits Within (2001)
84779,The Triplets of Belleville (2003)
122159,Mary and Max (2009)
151313,A Scanner Darkly (2006)
191602,Waking Life (2001)
...,...
190661,Big Trouble in Little China (1986)
151102,Men in Black (1997)
178606,Bright Angel (1990)
83586,The Amityville Horror (1979)


In [39]:
%%time
common_movies_df = pd.merge(movies_titles.reset_index(level=0), movies_genres_df, on=['title'], how='inner')
common_movies_df.rename(columns={'index': 'movie_id'}, inplace=True)
common_movies_df.head()

CPU times: user 30.1 ms, sys: 580 µs, total: 30.7 ms
Wall time: 31 ms


Unnamed: 0,movie_id,title,movielens_id,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,203371,Final Fantasy: The Spirits Within (2001),4446,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,122159,Mary and Max (2009),71899,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,191602,Waking Life (2001),4873,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,165710,The Boss Baby (2017),168418,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,84001,Donnie Darko (2001),4878,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
len(common_movies_df)

508

In [41]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [42]:
# Mean of every column per movie; the userId and timestamp means are discarded two cells below,
# leaving only the mean rating.
ratings_id_df = ratings_df.groupby('movieId').mean()

In [43]:
# Move movieId out of the index into a regular column and give it the same name
# ('movielens_id') that movies_genres_df uses, so the two frames can be merged on it.
ratings_id_df = ratings_id_df.reset_index(level=0).rename(columns={'movieId': 'movielens_id'})

In [44]:
# Only the mean rating is needed. Reassigning with errors='ignore' (instead of inplace=True)
# lets this cell be re-run without a KeyError once the two columns are already gone.
ratings_id_df = ratings_id_df.drop(columns=['userId', 'timestamp'], errors='ignore')

In [45]:
ratings_id_df

Unnamed: 0,movielens_id,rating
0,1,3.886649
1,2,3.246583
2,3,3.173981
3,4,2.874540
4,5,3.077291
...,...,...
53884,193876,3.000000
53885,193878,2.000000
53886,193880,2.000000
53887,193882,2.000000


In [46]:
# Attach the mean MovieLens rating; the inner join keeps only movies present in ratings_id_df.
# NOTE: common_movies_df is overwritten, so re-running this cell merges the rating column again.
common_movies_df = common_movies_df.merge(ratings_id_df, on=['movielens_id'], how='inner')
common_movies_df.head()

Unnamed: 0,movie_id,title,movielens_id,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),rating
0,203371,Final Fantasy: The Spirits Within (2001),4446,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.096147
1,122159,Mary and Max (2009),71899,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93669
2,191602,Waking Life (2001),4873,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.84552
3,165710,The Boss Baby (2017),168418,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.118729
4,84001,Donnie Darko (2001),4878,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.962736


In [47]:
common_movies_df.describe()

Unnamed: 0,movielens_id,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed),rating
count,507.0,507.0,507.0,507.0,507.0,507.0,507.0,507.0,507.0,507.0,...,507.0,507.0,507.0,507.0,507.0,507.0,507.0,507.0,507.0,507.0
mean,52755.637081,0.153846,0.057199,0.074951,0.495069,0.102564,0.183432,0.327416,0.240631,0.142012,...,0.071006,0.13215,0.021696,0.027613,0.029586,0.027613,0.019724,0.003945,0.003945,3.320066
std,61419.224547,0.361158,0.232452,0.263572,0.500469,0.303688,0.387402,0.469734,0.427889,0.349407,...,0.257088,0.338988,0.145834,0.164024,0.169609,0.164024,0.139187,0.062745,0.062745,0.48348
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.416667
25%,2760.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.038873
50%,7451.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.352941
75%,97357.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.690675
max,183295.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.257502
