### Check data usability

In [2]:
from graphsage.utils import load_data
import pandas as pd
import numpy as np

### Target
Generate movie features
- `movie` ID, title, produced year, genre (`label`)
- `user` each line is user and movies he or she rated
- `feature` each line is index + feature `index, x1, x2, x3, ...`

Two kinds of data:
- `before_YY` for training, before year `YY`.
- `year_YY` for testing, movie in year `YY`.

#### 1. `movie`

In [3]:
movies = pd.read_csv('mv2/movies.dat', sep='\t', engine='python', header=0)
movies = movies[['id', 'title', 'year']]
movies.shape

(10197, 3)

In [13]:
print(sorted(movies.year.unique()))

[1903, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011]


In [4]:
genre = pd.read_csv('mv2/movie_genres.dat', sep='\t', engine='python', header=0)
genre.head()

Unnamed: 0,movieID,genre
0,1,Adventure
1,1,Animation
2,1,Children
3,1,Comedy
4,1,Fantasy


In [5]:
# get all unique genres
genList = genre.genre.unique()
genList

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Film-Noir', 'Western', 'Short'], dtype=object)

In [21]:
mv_genre = genre.groupby('movieID')['genre'].apply(list).reset_index(name='genres')
mv_genre.head()

Unnamed: 0,movieID,genres
0,1,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,"[Adventure, Children, Fantasy]"
2,3,"[Comedy, Romance]"
3,4,"[Comedy, Drama, Romance]"
4,5,[Comedy]


In [23]:
mv_genre.shape

(10197, 2)

In [31]:
gen_dict = {g:i for i, g in enumerate(genList)}
mv_genre['label'] = mv_genre.genres.apply(lambda x: np.array([gen_dict[g] for g in x]))
mv_genre.head()

Unnamed: 0,movieID,genres,label
0,1,"[Adventure, Animation, Children, Comedy, Fantasy]","[0, 1, 2, 3, 4]"
1,2,"[Adventure, Children, Fantasy]","[0, 2, 4]"
2,3,"[Comedy, Romance]","[3, 5]"
3,4,"[Comedy, Drama, Romance]","[3, 6, 5]"
4,5,[Comedy],[3]


In [33]:
movies['label'] = mv_genre['label']
movies.to_csv('mv2-clean/movies.csv', index=False)  # np.array, won't contain "", but has []

In [6]:
node_dict = {mv:i for i, mv in enumerate(movies.id.values)}

#### 2. `user`

In [8]:
user = pd.read_csv('mv2/user_ratedmovies.dat', sep='\t', engine='python', header=0)
print(user.shape)
user.head()

(855598, 9)


Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,3,1.0,29,10,2006,23,17,16
1,75,32,4.5,29,10,2006,23,23,44
2,75,110,4.0,29,10,2006,23,30,8
3,75,160,2.0,29,10,2006,23,16,52
4,75,163,4.0,29,10,2006,23,29,30


In [38]:
user.movieID.unique().shape # about 90 single nodes

(10109,)

In [9]:
user = user[['userID', 'movieID', 'date_year']]
user.head()

Unnamed: 0,userID,movieID,date_year
0,75,3,2006
1,75,32,2006
2,75,110,2006
3,75,160,2006
4,75,163,2006


In [10]:
user = user.groupby(['userID', 
                     'date_year'])['movieID'].apply(np.array).reset_index(name='movies')
user.head()

Unnamed: 0,userID,date_year,movies
0,75,2006,"[3, 32, 110, 160, 163, 165, 173, 296, 353, 420..."
1,78,2004,"[17, 32, 41, 82, 101, 110, 150, 162, 198, 223,..."
2,78,2007,"[29, 111, 741, 1175, 1306, 3070, 4119, 5445, 5..."
3,127,2007,"[1707, 1911, 2013, 2136, 2719, 3157, 5523, 601..."
4,170,2006,"[1, 2, 10, 19, 21, 32, 34, 39, 47, 50, 104, 11..."


In [77]:
sorted(user.date_year.unique())

[1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009]

In [45]:
user.to_csv('mv2-clean/user.csv', index=False)

In [15]:
mv_count = np.zeros([len(node_dict), len(node_dict)])

In [16]:
from collections import defaultdict
year = 2009
adj = defaultdict(set)

for _, row in user[user.date_year < year].iterrows():
    for mvx in row.movies:
        try:
            mv1 = node_dict[mvx]
            for mvy in row.movies:
                if mvy == mvx:
                    pass
                mv2 = node_dict[mvy]
                mv_count[mv1, mv2] += 1
                mv_count[mv2, mv1] += 1
                if mv_count[mv1, mv2] == 1:
                    adj[mv1].add(mv2)
                if mv_count[mv2, mv1] == 1:
                    adj[mv2].add(mv1)
        except:
            pass
        
edges = [len(i[1]) for i in adj.items()]
sum(edges)

27387370

In [17]:
# count == 0
sum(mv_count[mv_count > 4].astype(int))
# for x in range(len(node_dict)):
#     for y in range(len(node_dict)):
#         if mv_count[x, y] >4

511553466

In [39]:
a = mv_count[mv_count > 199]
a.shape

(409190,)

In [46]:
b = np.copy(mv_count)
print(b.shape)
b[b < 2] = 0
c = [sum(x) for x in b]
c = np.array(c)
print(c.shape)
c[c==0].shape

(10197, 10197)
(10197,)


(2961,)

In [47]:
b = np.copy(mv_count)
print(b.shape)
b[b < 1] = 0
c = [sum(x) for x in b]
c = np.array(c)
print(c.shape)
c[c==0].shape

(10197, 10197)
(10197,)


(2961,)

#### Features

In [53]:
features = pd.read_csv('mv2/movie_actors.dat', sep='\t', engine='python', header=0)
actors = features.groupby(['actorID']).count()
actor_list = actors[actors.movieID > 9].index.values  # get frequent actor list

In [52]:
features = features.groupby(['movieID'])['actorID'].apply(np.array).reset_index(name='actors')
print(features.shape)
features.head()

(10174, 2)


Unnamed: 0,movieID,actors
0,1,"[annie_potts, bill_farmer, don_rickles, erik_v..."
1,2,"[1135379-peter_bryant, adam_hannbyrd, bebe_neu..."
2,3,"[annmargret, buck_henry, buffy_sedlachek, burg..."
3,4,"[1026174-leon, angela_bassett, brandon-hammond..."
4,5,"[ann-walker, annie_meyers_shyer, april_ortiz, ..."


In [63]:
country = pd.read_csv('mv2/movie_countries.dat', sep='\t', engine='python', header=0)