# Data Exploration
It is time to explore and build the final dataset for machine learning.

## Profiles
We can start with the profiles dataset.

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Load the cleaned profiles table; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv('../data/profiles_clean.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,profile,gender,favorites_anime
0,DesolatePsyche,Male,"['33352', '25013', '5530', '33674', '1482', '2..."
1,baekbeans,Female,"['11061', '31964', '853', '20583', '918', '925..."
2,skrn,,"['918', '2904', '11741', '17074', '23273', '32..."
3,edgewalker00,Male,"['5680', '849', '2904', '3588', '37349']"
4,aManOfCulture99,Male,"['4181', '7791', '9617', '5680', '2167', '4382..."


Right now, the `favorites_anime` feature is a string formatted as an array. We will need to convert that into a proper array first before analysis.

In [4]:
import ast
def perfectEval(anonstring):
    """Parse a stringified Python literal, falling back to the raw text.

    Parameters
    ----------
    anonstring : str
        Text holding a Python literal, e.g. "['33352', '25013']".

    Returns
    -------
    object
        The value produced by ``ast.literal_eval`` when the text is a valid
        literal; otherwise ``anonstring`` itself, unchanged.

    Raises
    ------
    TypeError
        If ``anonstring`` is not a string and cannot be evaluated.
    """
    try:
        return ast.literal_eval(anonstring)
    except (ValueError, SyntaxError) as err:
        # literal_eval reports non-literal text with ValueError (parses, but is
        # not a literal) or SyntaxError (does not parse at all). Both mean
        # "this is plain text", so hand the text back as-is instead of
        # re-quoting it, which would break on embedded quotes or backslashes.
        if not isinstance(anonstring, str):
            raise TypeError(
                f"perfectEval expects a string, got {type(anonstring).__name__}"
            ) from err
        return anonstring

This `perfectEval` function will convert the stringified array back into an array, which will be stored for future use.

In [5]:
# Parse each stringified list into a real Python list.
# NOTE(review): this cell is not idempotent -- re-running it feeds already-parsed
# lists back into perfectEval, which likely raises; reload df before re-running.
df['favorites_anime'] = df['favorites_anime'].apply(perfectEval)

## Converting Favorited Anime into a Genre Representation
Having `favorites_anime` in an array format isn't very helpful for analysis on its own. What we need is a count of how often each genre appears among a user's favorites. The genre with the highest count is the most common genre among that user's favorites.

To extract the genres from all of a user's favorites, we will look up each favorite and add its genre columns together. The `animes_clean.csv` file already has the genres of anime shows in a one-hot encoding to facilitate this addition.

In [6]:
# Load the cleaned anime table; its genre columns hold 0/1 flags per anime.
animes = pd.read_csv('../data/animes_clean.csv')

In [7]:
animes.head()

Unnamed: 0,uid,title,episodes,members,score,Comedy,Action,Fantasy,Adventure,Drama,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,9317,Doll Saaya,1.0,609,4.61,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,38339,Suzumi-bune,1.0,137,5.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,39731,Na Bbeun Sang Sa,1.0,149,5.61,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
3,40131,Junjou Juugeki Cosplay Shoujo,1.0,117,3.95,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5569,Tsui no Sora,1.0,1821,2.84,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
type(animes.iloc[0]['uid'])

numpy.int64

In [9]:
type(df.iloc[0]['favorites_anime'][0])

str

The uid in `animes` is an integer, while the uids in `favorites_anime` are strings. Convert the `animes` dataframe's uid column to string so the two can be compared when looking up a user's favorites.

In [10]:
# Store uids as text so they compare equal to the string ids in favorites_anime.
animes['uid'] = animes['uid'].astype(str)

Load the genres column names saved earlier in the animes-cleaning notebook.

In [33]:
import pickle
# NOTE: unpickling can execute arbitrary code -- only load files written by this
# project's own notebooks.
# The context manager closes the file handle as soon as the list has been read.
with open('../data/genres.pickle', 'rb') as genres_file:
    genres = pickle.load(genres_file)

In [34]:
genres

['Comedy',
 'Action',
 'Fantasy',
 'Adventure',
 'Drama',
 'Sci-Fi',
 'Hentai',
 'Kids',
 'Shounen',
 'Romance',
 'Slice of Life',
 'Music',
 'School',
 'Supernatural',
 'Historical',
 'Mecha',
 'Magic',
 'Seinen',
 'Mystery',
 'Sports',
 'Ecchi',
 'Shoujo',
 'Super Power',
 'Parody',
 'Military',
 'Demons',
 'Space',
 'Horror',
 'Harem',
 'Dementia',
 'Martial Arts',
 'Psychological',
 'Game',
 'Police',
 'Samurai',
 'Vampire',
 'Thriller',
 'Cars',
 'Josei',
 'Shounen Ai',
 'Shoujo Ai',
 'Yuri',
 'Yaoi']

In [35]:
# Column layout for the per-profile totals: the profile name, then one column per genre.
genre_total_cols = ['profile', *genres]
genre_total = pd.DataFrame(columns=genre_total_cols)

In [36]:
genre_total

Unnamed: 0,profile,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,Shounen,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi


In [57]:
def genre_sum(profile_id, favorites):
    """Add up the genre flags of every anime in a user's favorites.

    Each uid in ``favorites`` is looked up in the module-level ``animes`` frame
    and the remaining (genre) columns of the matching rows are summed. A uid
    that appears more than once in ``favorites`` is counted each time; a uid
    with no match in ``animes`` contributes nothing.

    Parameters
    ----------
    profile_id : str
        Profile name stored in the ``profile`` column of the result.
    favorites : iterable of str
        Anime uids, in the same (string) form as ``animes['uid']``.

    Returns
    -------
    pandas.DataFrame
        A single row: ``profile`` first, then one summed column per genre.
        When ``favorites`` is empty the row contains only ``profile``.
    """
    # Everything that is not a genre flag is dropped before summing.
    meta_cols = ['uid', 'title', 'episodes', 'members', 'score']
    matches = [
        animes[animes['uid'] == uid].drop(meta_cols, axis=1)
        for uid in favorites
    ]
    # pd.concat rejects an empty list, so fall back to an empty frame; a single
    # concat also replaces the row-by-row DataFrame.append (gone in pandas 2.x).
    stacked = pd.concat(matches) if matches else pd.DataFrame()
    row_sum = stacked.sum().to_frame().T
    row_sum.insert(0, 'profile', profile_id)
    return row_sum

In [58]:
# First ten profiles, kept as a small sample for inspection.
test = df.head(10)

In [59]:
test

Unnamed: 0,profile,gender,favorites_anime
0,DesolatePsyche,Male,"[33352, 25013, 5530, 33674, 1482, 269, 18245, ..."
1,baekbeans,Female,"[11061, 31964, 853, 20583, 918, 9253, 34599, 3..."
2,skrn,,"[918, 2904, 11741, 17074, 23273, 32281, 9989, ..."
3,edgewalker00,Male,"[5680, 849, 2904, 3588, 37349]"
4,aManOfCulture99,Male,"[4181, 7791, 9617, 5680, 2167, 4382, 849, 235,..."
5,eneri,,"[5114, 4898, 2904, 1575, 1482]"
6,Waffle_Empress,,"[338, 322, 440, 199, 28223, 12815, 2800, 18679..."
7,NIGGER_BONER,Male,"[11061, 30, 6594, 28701, 10087, 6746, 918, 153..."
8,jchang,Male,"[846, 2904, 5114, 2924, 72]"
9,shadowsplat,,[]


In [62]:
# Build one genre-count row per profile, then stack them with a single concat.
# Appending to a frame inside the loop copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.x. The comprehension also avoids
# binding a module-level name `sum`, which would shadow the builtin.
genre_rows = [
    genre_sum(profile_id, favorites)
    for profile_id, favorites in zip(df['profile'], df['favorites_anime'])
]
genre_total = pd.DataFrame(columns=genre_total_cols)
if genre_rows:
    # Align to the agreed column layout. Profiles with no favorites yield a row
    # that only has 'profile', so their genre cells come out as NaN here.
    genre_total = pd.concat(genre_rows).reindex(columns=genre_total_cols)

In [71]:
# Remove profiles whose genre cells are NaN (no favorites). Rebinding instead of
# inplace=True keeps the cell re-runnable, and the cast restores integer counts
# because the NaN rows force the genre columns to a non-integer dtype.
genre_total = genre_total.dropna().astype({genre: int for genre in genres})

In [72]:
genre_total

Unnamed: 0,profile,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,Shounen,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,DesolatePsyche,8,8,5,7,9,2,0,0,5,...,0,0,1,0,0,1,0,0,0,0
0,baekbeans,6,5,3,2,2,4,0,0,5,...,1,1,0,1,0,0,0,0,0,0
0,skrn,3,3,3,1,5,3,0,0,2,...,0,1,1,1,0,0,0,0,0,0
0,edgewalker00,3,3,2,1,1,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0,aManOfCulture99,7,0,0,0,4,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,lovelessxd,2,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
0,Shattered_Angel,5,3,3,3,5,1,0,0,2,...,0,0,2,2,0,1,0,0,0,0
0,FluffyWalrus,2,4,2,4,3,3,0,0,2,...,1,1,0,0,0,0,0,0,0,0
0,camco,3,1,1,1,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [79]:
# Every stacked row still carries index label 0; renumber rows 0..n-1 and discard the old labels.
genre_total = genre_total.reset_index(drop=True)

In [81]:
# reset_index(drop=True) does not create an 'index' column, so a plain drop would
# raise KeyError on a fresh Restart & Run All; errors='ignore' keeps this preview
# working whether or not such a column exists.
genre_total.drop(columns=['index'], errors='ignore')

Unnamed: 0,profile,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,Shounen,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,DesolatePsyche,8,8,5,7,9,2,0,0,5,...,0,0,1,0,0,1,0,0,0,0
1,baekbeans,6,5,3,2,2,4,0,0,5,...,1,1,0,1,0,0,0,0,0,0
2,skrn,3,3,3,1,5,3,0,0,2,...,0,1,1,1,0,0,0,0,0,0
3,edgewalker00,3,3,2,1,1,2,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,aManOfCulture99,7,0,0,0,4,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65120,lovelessxd,2,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
65121,Shattered_Angel,5,3,3,3,5,1,0,0,2,...,0,0,2,2,0,1,0,0,0,0
65122,FluffyWalrus,2,4,2,4,3,3,0,0,2,...,1,1,0,0,0,0,0,0,0,0
65123,camco,3,1,1,1,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [82]:
# Persist the per-profile genre counts; the row index is written as the first (unnamed) CSV column.
genre_total.to_csv('../data/dataset.csv')