In [1]:
#importing the libraries
import numpy as np
import pandas as pd

In [2]:
# Load the raw tables. None of the files has a header row, so column names
# are assigned explicitly after each read.

# Ratings: user_id / item_id / rating / timestamp, tab-separated.
df = pd.read_csv('udata.txt', sep='\t', header=None)
df.columns = ['user_id', 'item_id', 'rating', 'timestamp']

# User attributes, pipe-separated.
users = pd.read_csv('uuser.txt', sep='|', header=None)
users.columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# Genre lookup (name -> id). Loaded for reference; not used in the merge below.
genre = pd.read_csv('ugenre.txt', sep='|', header=None)
genre.columns = ['genre', 'genre_id']

# Occupation names. The row position of each occupation is exposed as an
# integer code in a `job` column so it can be joined onto the users.
job = pd.read_csv('uoccupation.txt', sep='|', header=None)
job.columns = ['occupation']
job = job.reset_index().rename(columns={'index': 'job'})

# Movie metadata followed by one indicator column per genre.
# The file is read as latin-1 (the original passes this encoding explicitly).
items = pd.read_csv('uitem.txt', sep='|', header=None, encoding='latin-1')
items.columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
                 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Children',
                 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir',
                 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci_Fi',
                 'Thriller', 'War', 'Western']

# Combine everything onto the ratings with left joins, so every rating row is
# kept even when a lookup has no match.
data = (
    df.merge(users, how='left', on='user_id')
      .merge(job, how='left', on='occupation')
      .merge(items, how='left', left_on='item_id', right_on='movie_id')
)

# Drop rows whose `Western` flag is missing (presumably ratings that found no
# complete movie row in the left join -- verify against the data), then store
# that flag as an integer.
data = data[data.Western.notna()].astype({'Western': 'int'})

# Numeric gender code: 0 for 'M', 1 for every other value (including missing).
data['gender_id'] = np.where(data.gender == 'M', 0, 1)

data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation,zip_code,job,movie_id,...,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western,gender_id
0,196,242,3,881250949,49,M,writer,55105,20,242,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,0,6,302,...,1,0,0,1,0,0,1,0,0,1
2,22,377,1,878887116,25,M,writer,40206,20,377,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,28,M,technician,80525,19,51,...,0,0,0,0,1,0,0,1,1,0
4,166,346,1,886397596,47,M,educator,55113,3,346,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Minimum number of non-missing ratings a movie / a user must have to be kept.
MIN_RATINGS_PER_MOVIE = 200
MIN_RATINGS_PER_USER = 10

# User x movie rating matrix: one row per user_id, one column per movie_title.
# pivot_table aggregates with its default (mean) when a (user, title) pair
# occurs more than once; pairs with no rating become NaN.
data_matrix = data.pivot_table(values='rating', columns='movie_title', index='user_id')

# Keep only movies (columns) that have at least MIN_RATINGS_PER_MOVIE non-NaN
# ratings. Note that `thresh` counts non-missing values, not missing ones.
data_matrix = data_matrix.dropna(thresh=MIN_RATINGS_PER_MOVIE, axis=1)
print(data_matrix.shape)

# Keep only users (rows) that rated at least MIN_RATINGS_PER_USER of the
# remaining movies.
data_matrix = data_matrix.dropna(thresh=MIN_RATINGS_PER_USER, axis=0)
data_matrix.head()

(943, 118)


movie_title,2001: A Space Odyssey (1968),Air Force One (1997),Aladdin (1992),Alien (1979),Aliens (1986),Amadeus (1984),Apocalypse Now (1979),Apollo 13 (1995),Babe (1995),Back to the Future (1985),...,True Lies (1994),"Truth About Cats & Dogs, The (1996)",Twelve Monkeys (1995),Twister (1996),"Usual Suspects, The (1995)",Volcano (1997),When Harry Met Sally... (1989),Willy Wonka and the Chocolate Factory (1971),"Wizard of Oz, The (1939)",Young Frankenstein (1974)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,5.0,5.0,5.0,3.0,4.0,1.0,5.0,...,,5.0,4.0,3.0,5.0,,5.0,4.0,4.0,5.0
2,,4.0,,,,,,,,,...,,4.0,,,,,,,,
3,,2.0,,,,,,,,,...,,,,,,,,,,
4,,5.0,,,,,,,,,...,,,,,,,,,,
5,4.0,,4.0,4.0,3.0,,,,,4.0,...,4.0,,,,,,1.0,3.0,,4.0


In [6]:
# Pearson correlation between the target movie's rating column and every
# other movie column. The full correlation matrix is computed once and the
# target column is reused for both lists below.
target_title = '2001: A Space Odyssey (1968)'
target_corr = data_matrix.corr(method='pearson')[target_title]

print("------------------------------------------------------------------------------")
print("List of 10 recommended movies to a user who has liked '2001: A Space Odyssey'")
# NOTE(review): the target movie correlates 1.0 with itself, so it is the
# first entry of this list and only nine other titles are shown.
print(target_corr.sort_values(ascending=False).iloc[:10])
print()
print("------------------------------------------------------------------------------")
print("List of 10 movies to NOT recommend a user who liked '2001: A Space Odyssey'")
# Ascending sort: the most negatively correlated movies come first.
print(target_corr.sort_values().iloc[:10])
print()
print("------------------------------------------------------------------------------")

------------------------------------------------------------------------------
List of 10 recommended movies to a user who has liked '2001: A Space Odyssey'
movie_title
2001: A Space Odyssey (1968)          1.000000
Clockwork Orange, A (1971)            0.388071
People vs. Larry Flynt, The (1996)    0.327292
Apocalypse Now (1979)                 0.312847
Godfather, The (1972)                 0.305717
Fargo (1996)                          0.299882
Terminator, The (1984)                0.260066
Brazil (1985)                         0.254415
Graduate, The (1967)                  0.252987
Alien (1979)                          0.248089
Name: 2001: A Space Odyssey (1968), dtype: float64

------------------------------------------------------------------------------
List of 10 movies to NOT recommend a user who liked '2001: A Space Odyssey'
movie_title
Air Force One (1997)          -0.282994
Murder at 1600 (1997)         -0.222437
In & Out (1997)               -0.195379
Saint, The (1997)     