In [1]:
import ast
from collections import defaultdict
from datetime import datetime, timezone

import numpy as np
import pandas as pd

### Target
Generate movie features
- `mv.edge`: each line `x, y` represents an edge between nodes `x` and `y`.
- `mv.feature`: each line is a node index followed by its features: `index, x1, x2, x3, ...`.
- `mv.label`: each line is a node index followed by its label list (1 if the label applies, 0 if not).

Two kinds of data:
- `before_YY`: training data, movies from before year `YY`.
- `year_YY`: test data, movies from year `YY`.

### Source
- `movies.dat    MovieID::Title::Genres`.
- `ratings.dat    UserID::MovieID::Rating::Timestamp`.
- `tags.dat    UserID::MovieID::Tag::Timestamp`.

In [125]:
# Raw user-applied tags. '::' is a multi-character separator, which pandas
# only supports through the python parsing engine.
tags = pd.read_csv(
    'ml-10m/tags.dat',
    sep='::',
    engine='python',
    header=None,
    names=["UserID", "MovieID", "Tag", "Timestamp"],
)

In [5]:
# Movie catalogue: one row per movie, genres kept as a '|'-separated string.
movies = pd.read_csv(
    'ml-10m/movies.dat',
    sep='::',
    engine='python',
    header=None,
    names=["MovieID", "Title", "Genres"],
)

### 1. Filter movies according to tags, encode genres and store as one-hot string

In [13]:
# Keep only movies that received at least one tag. The explicit .copy() makes
# `movies` an independent frame, so the column assignments below do not write
# into a slice of the unfiltered table (avoids SettingWithCopyWarning).
movies = movies[movies.MovieID.isin(tags.MovieID.unique())].copy()
# Assumes every title ends in " (YYYY)": the four characters inside the
# trailing parentheses are parsed as the year, then the 7-character suffix
# is removed from the title.
movies["Year"] = movies.Title.str[-5:-1].astype(int)
movies["Title"] = movies.Title.str[:-7]

In [58]:
# Fixed genre vocabulary: position i in this array is character i of every
# genre code produced by encode().
genres = np.array([
    'Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance',
    'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi',
    'IMAX', 'Documentary', 'War', 'Musical', 'Film-Noir', 'Western',
])


def encode(glist):
    """Encode a '|'-separated genre string as a string of '0'/'1' flags.

    Parameters
    ----------
    glist : str
        Genres joined by '|', e.g. ``'Comedy|Romance'``.

    Returns
    -------
    str
        One character per entry of ``genres`` (same order): '1' when that
        genre appears in ``glist``, '0' otherwise. Names that are not in
        ``genres`` are ignored.
    """
    listed = glist.split('|')
    return ''.join('1' if genre in listed else '0' for genre in genres)

# Attach the 0/1 genre string to every movie row.
movies['Gencode'] = movies['Genres'].map(encode)

In [59]:
movies['Gencode'].head()

0    1111100000000000000
1    1010100000000000000
2    0001010000000000000
3    0001011000000000000
4    0001000000000000000
Name: Gencode, dtype: object

In [64]:
# Persist the filtered movie table (with Year and Gencode) without the index column.
movies.to_csv(path_or_buf='ml-10m/movies.csv', index=False)

### 2. Create user list of rated movies

In [65]:
rating = pd.read_csv(
    'ml-10m/ratings.dat',
    sep='::',
    engine='python',
    header=None,
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)
# Keep only ratings of tagged movies. .copy() detaches the result from the
# unfiltered frame so that adding columns to `rating` later does not trigger
# SettingWithCopyWarning.
rating = rating[rating.MovieID.isin(tags.MovieID.unique())].copy()

In [90]:
def year(timestamp):
    """Return the calendar year (in UTC) of a Unix timestamp.

    Parameters
    ----------
    timestamp : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    int
        Year of the instant, evaluated in UTC. Pinning the timezone keeps the
        result independent of the machine's local timezone, so ratings made
        close to New Year fall in the same year wherever the notebook runs.
    """
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).year

# Stamp every rating with the year in which it was made.
rating['Year'] = rating['Timestamp'].map(year)
rate = rating.loc[:, ['UserID', 'Year', 'MovieID']]
# One row per (UserID, Year) holding the list of MovieIDs rated in that year.
user = (
    rating
    .groupby(['UserID', 'Year'])['MovieID']
    .apply(list)
    .reset_index(name='Movies')
)

In [91]:
# Persist the per-user, per-year movie lists; each list is written as its
# Python repr (e.g. "[122, 185]") inside a single CSV field.
user.to_csv(path_or_buf='ml-10m/users.csv', index=False)

In [93]:
user.head(n=5)

Unnamed: 0,UserID,Year,Movies
0,1,1996,"[122, 185, 231, 292, 316, 329, 355, 356, 362, ..."
1,2,1997,"[110, 151, 260, 376, 539, 590, 648, 719, 733, ..."
2,3,2005,"[110, 151, 213, 590, 1148, 1246, 1252, 1276, 1..."
3,3,2006,"[3408, 4535, 5299, 5527, 7155, 8533, 27821, 33..."
4,4,1996,"[21, 34, 39, 110, 150, 153, 161, 165, 208, 231..."


### 3. Encode tags to feature vectors

In [122]:
# Drop rows with any missing value. Rebinding instead of inplace=True keeps
# the cell safe to re-run, and the preview comes last so it is the cell's
# displayed output and reflects the cleaned table.
tags = tags.dropna()
tags.head()

Unnamed: 0,UserID,MovieID,Tag,Timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [133]:
# Collect every tag applied to a movie into one list per MovieID.
features = (
    tags
    .groupby('MovieID')['Tag']
    .apply(list)
    .reset_index(name='Tags')
)

In [240]:
# Count how often each tag occurs over all movies. The per-movie lists are
# flattened directly; expanding them with apply(pd.Series) would materialise a
# dense (movies x longest tag list) frame only to stack it back down.
# rename_axis pins the tag column's name to 'index' regardless of pandas version.
tag_count = (
    pd.Series([tag for movie_tags in features['Tags'] for tag in movie_tags])
    .value_counts()
    .rename_axis('index')
    .reset_index(name='tc')
)
# Keep tags that occur more than 4 times; their order (most frequent first)
# defines the feature columns.
freq_tag = tag_count[tag_count.tc > 4]
freq_tag = freq_tag.rename(columns={'index': 'name', 'tc': 'counts'})
freq_set = freq_tag['name'].tolist()

In [241]:
def embed(tags):
    """Encode a movie's tags as a '0'/'1' string aligned with ``freq_set``.

    Parameters
    ----------
    tags : iterable of str
        Tags applied to one movie (duplicates allowed).

    Returns
    -------
    str
        One character per entry of ``freq_set`` (same order): '1' when that
        frequent tag is among ``tags``, '0' otherwise.
    """
    # A set gives constant-time membership tests instead of rescanning the
    # movie's tag list once for every frequent tag.
    present = set(tags)
    return ''.join('1' if tag in present else '0' for tag in freq_set)

features['features'] = features.Tags.apply(embed)

In [242]:
# each movie has 3000 features (original author's note; the actual width is len(freq_set))
features.to_csv('ml-10m/features.csv', columns=['MovieID', 'features'], index=False)

### Load data for training

In [30]:
def _build_split(labels, features, users, n_features):
    """Build (adjacency, feature matrix, label list) for one subset of movies.

    Parameters
    ----------
    labels : DataFrame
        Rows of movies.csv belonging to this split; row order defines the
        node index of each MovieID.
    features : DataFrame
        Full features.csv table (``MovieID``, ``features`` bit string).
    users : DataFrame
        Rows of users.csv for this split; ``Movies`` holds lists of MovieIDs.
    n_features : int
        Width of the feature matrix.
    """
    node_map = {movie_id: node for node, movie_id in enumerate(labels['MovieID'])}
    label_list = [np.fromiter(map(int, code), dtype=int) for code in labels['Gencode']]

    # Movies without a row in `features` keep an all-zero feature vector.
    feature_matrix = np.zeros((len(label_list), n_features))
    in_split = features[features['MovieID'].isin(labels['MovieID'])]
    for movie_id, bits in zip(in_split['MovieID'], in_split['features']):
        feature_matrix[node_map[movie_id]] = np.fromiter(map(int, bits), dtype=float)

    # Two movies are adjacent when one user rated both within the same year.
    adjacency = defaultdict(set)
    for rated in users['Movies']:
        # Movies outside this split are skipped individually rather than
        # aborting the rest of the user's list.
        nodes = {node_map[movie_id] for movie_id in rated if movie_id in node_map}
        if len(nodes) < 2:
            continue  # no pair to connect; also avoids creating empty entries
        for node in nodes:
            adjacency[node].update(nodes - {node})  # never link a node to itself
    return adjacency, feature_matrix, label_list


def load_data(year):
    """Load the train/test graph data from the CSV files under ``ml-10m/``.

    Movies whose ``Year`` is before ``year`` form the training set and movies
    whose ``Year`` equals ``year`` form the test set. Edges of each set are
    built from the user rows of users.csv selected by the same year condition.

    Parameters
    ----------
    year : int
        Year used as the test set; all earlier years are the training set.

    Returns
    -------
    tuple
        ``(train_adj, train_feature, train_label, test_adj, test_feature,
        test_label)`` where each ``*_adj`` is a ``defaultdict(set)`` mapping a
        node index to its neighbours, each ``*_feature`` is a float array of
        shape (n_movies, n_features) and each ``*_label`` is a list of 0/1
        integer arrays decoded from ``Gencode``.
    """
    # users.csv stores each movie list as its Python repr ("[1, 2, 3]");
    # literal_eval turns it back into a real list of ints without executing code.
    users = pd.read_csv('ml-10m/users.csv', converters={'Movies': ast.literal_eval})
    features = pd.read_csv('ml-10m/features.csv', dtype={'features': 'str'})
    labels = pd.read_csv('ml-10m/movies.csv', dtype={'Gencode': 'str'})

    # Take the feature width from the data; 3000 is the historical width and
    # is only used when the feature table is empty.
    n_features = len(features['features'].iloc[0]) if len(features) else 3000

    train = _build_split(labels[labels.Year < year], features,
                         users[users.Year < year], n_features)
    test = _build_split(labels[labels.Year == year], features,
                        users[users.Year == year], n_features)
    return train + test

In [31]:
# Movies before 1996 form the training graph; movies from 1996 form the test graph.
(train_adj, train_feat, train_label,
 test_adj, test_feat, test_label) = load_data(1996)