In [1]:
import re
import pandas as pd
import numpy as np
import datetime as dt
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.model_selection import cross_val_score

In [2]:
# Load saved data: three pickle files, one per saved batch.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted source.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as picklefile:
        return pickle.load(picklefile)


data = _read_pickle('/home/alex/data/project_2_datard3.pkl')
data2 = _read_pickle('/home/alex/data/project_2_datard4.pkl')
data3 = _read_pickle('/home/alex/data/project_2_datard5.pkl')

In [3]:
# Stack the three batches into one frame, then drop observations that share a
# release ID ('id'); drop_duplicates keeps the first row it sees for each id.
df = pd.concat(
    [pd.DataFrame(batch) for batch in (data, data2, data3)]
).drop_duplicates(subset='id')

In [4]:
# Move 'release' and 'id' out of the columns and into a two-level row index.
# Not re-runnable on its own: after this cell those two columns no longer exist.
index_levels = ['release', 'id']
df = df.set_index(index_levels)

#### Impute missing data

In [5]:
# Fill gaps column by column with fixed stand-in values.
# NOTE(review): 1996 and 4.1 are hard-coded; the original note describes them
# as mean values -- confirm they still match the loaded data.
fill_values = (
    ('release_date', 1996),
    ('rating', 4.1),
    ('style', ''),
    ('ask_price', 0),
)
for column, fill_value in fill_values:
    df[column] = df[column].fillna(fill_value)

#### Convert Dates to numbers

In [6]:
# Turn both date fields into plain integers.
def _sale_date_to_ordinal(text):
    """Parse a '%d %b %y' date string and return its proleptic Gregorian ordinal."""
    return dt.datetime.strptime(text, '%d %b %y').toordinal()


# release_date: pass each value through int()
df.release_date = df.release_date.apply(int)

# last_sold: date string -> datetime -> day ordinal, in a single pass
df.last_sold = df.last_sold.apply(_sale_date_to_ordinal)

#### Drop features that are high cardinality

In [7]:
# Columns dropped here are the ones the section heading calls high cardinality.
high_cardinality = ['artist', 'label', 'format_details']
df = df.drop(high_cardinality, axis=1)

#### Bin other categorical features and dummify

In [8]:

def _rare_labels(counts, threshold=100):
    """Return the labels of `counts` (a value_counts Series) whose count is below `threshold`."""
    return list(counts[counts < threshold].index)


# Formats that occur fewer than 100 times are merged into a single 'Other' value.
formatcounts = df['format'].value_counts()
otherformats = _rare_labels(formatcounts)
df['format'] = df['format'].replace(otherformats, 'Other')

# Same treatment for countries.
countrycounts = df['country'].value_counts()
othercountries = _rare_labels(countrycounts)
df['country'] = df['country'].replace(othercountries, 'Other')

In [9]:
# One-hot encode the two binned categoricals; each listed column is replaced
# by indicator columns (default naming is '<column>_<value>').
dummy_columns = ['format', 'country']
df = pd.get_dummies(df, columns=dummy_columns)

#### Save as base case data

In [10]:
# Persist the current frame as the base-case dataset.
basecase_path = '/home/alex/data/project_2_basecase.pkl'
with open(basecase_path, 'wb') as basecase_file:
    pickle.dump(df, basecase_file)

#### Create encoding for genre with increasing weights

In [11]:
# Work on a copy so the base-case frame `df` is left untouched.
df2 = df.copy()

# Genres are stored as one comma-separated string per row. The genre
# 'Folk, World, & Country' itself contains commas, so collapse it to a single
# token before splitting (regex=True makes replace() match it as a substring
# rather than requiring the whole cell to equal it).
df2['genre'] = df2['genre'].replace('Folk, World, & Country', 'FolkWorldCountry', regex=True)

# Collect every distinct genre name that appears in any row.
genreset = set()
for genres in df2['genre']:
    genreset.update(name.strip() for name in genres.split(','))

# One integer column per genre, 0 meaning "genre not listed for this row".
# Iterate in sorted order: the iteration order of a set of strings depends on
# string hashing, which Python 3 randomizes per process by default, so looping
# over the bare set would give a different column order on every kernel restart.
for genre in sorted(genreset):
    df2[genre] = 0

# Each genre gets encoded according to its position in the row's list:
# first genre = 1, second = 2, etc.
for row in df2.itertuples():
    for idx, item in enumerate(row.genre.split(',')):
        df2.at[row.Index, item.strip()] = idx + 1

In [12]:
# The raw genre string has been expanded into per-genre columns, so remove it.
df2 = df2.drop('genre', axis=1)

#### Drop price data

In [13]:
# Remove the asking / highest / lowest price fields; 'median' stays in the frame.
price_columns = ['ask_price', 'highest', 'lowest']
df2 = df2.drop(price_columns, axis=1)

#### Drop Outliers

In [14]:
# Keep only rows whose median price lies strictly between 0.15 and 200.
median_price = df2['median']
df2 = df2[(median_price > 0.15) & (median_price < 200)]

In [15]:
# Persist the genre-encoded frame.
genretags_path = '/home/alex/data/project_2_genretags.pkl'
with open(genretags_path, 'wb') as genretags_file:
    pickle.dump(df2, genretags_file)

#### Create encoding with increasing weights for style

In [16]:
# Work on a copy so the genre-encoded frame `df2` is left untouched.
df5 = df2.copy()

# Impute missing style data with the label 'None'. The imputation cell earlier
# in this notebook already turned NaN into '', so fillna() alone would match
# nothing and rows without a style would be counted under an empty-string
# style name; map '' to 'None' as well so the label is actually applied.
df5['style'] = df5['style'].fillna('None').replace('', 'None')

# Count occurrences of every style across all rows (styles are stored as one
# comma-separated string per row).
styleset = defaultdict(int)
for styles in df5['style']:
    for name in styles.split(','):
        styleset[name.strip()] += 1

# Sort styles by count and keep the top fifty. Sorting (count, name) tuples in
# reverse means ties in count are broken by name, descending.
sortedstyles = [(count, name) for name, count in styleset.items()]
popularstyles = [name for _, name in sorted(sortedstyles, reverse=True)[:50]]
# Set copy of the list for constant-time membership tests in the loop below.
popular_lookup = set(popularstyles)

# Create a column for each popular style. A style that shares its name with a
# genre is skipped: creating it would overwrite the genre weights that already
# live in a column of that name.
for style in popularstyles:
    if style not in genreset:
        df5[style] = 0
df5['Other_Style'] = 0

# Add the encoding: a style's value is its position in the row's list
# (first style = 1, second = 2, ...).
for row in df5.itertuples():
    # Position of the first style of this row that is outside the top fifty;
    # 0 while none has been seen. Assumes row index labels are unique, which
    # the earlier drop_duplicates on 'id' is meant to ensure.
    other_rank = 0
    for idx, item in enumerate(row.style.split(',')):
        style = item.strip()
        if style not in popular_lookup:
            # If several of the row's styles fall into 'other', only the
            # earliest position is recorded.
            if other_rank == 0:
                other_rank = idx + 1
                df5.at[row.Index, 'Other_Style'] = other_rank
        elif style in genreset:
            # Skip styles that are also genres; their column holds genre weights.
            continue
        else:
            df5.at[row.Index, style] = idx + 1


In [17]:
# The raw style string has been expanded into per-style columns, so remove it.
df5 = df5.drop('style', axis=1)

In [18]:
# Persist the style-encoded frame.
styletags_path = '/home/alex/data/project_2_styletags.pkl'
with open(styletags_path, 'wb') as styletags_file:
    pickle.dump(df5, styletags_file)