###   Model to Predict Anime Score
The purpose of this project is to practice skills in data analysis and machine learning.
To achieve this, I attempt to predict what an anime's MyAnimeList score will be, given certain parameters.

Scenario: As an indie company, can we predict the score our project will get?

In [None]:
#import libraries
import pandas as pd 
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
pd.set_option('display.max_columns', 1500)

In [None]:
# Raw string so the Windows backslashes are never parsed as escape sequences
# ('\A' is an invalid escape and produces a warning on recent Python versions).
# NOTE(review): this is an absolute local path; consider a path relative to a data directory.
csv_path = r'E:\AnimeAnalysis\AnimeList.csv'
main_df = pd.read_csv(csv_path)
# Take a real copy; a plain assignment would only create a second name for main_df.
df_copy = main_df.copy()
df_copy.head()

# Data Cleaning

Drop unneeded columns

In [None]:
df_copy.columns

In [None]:
# Columns that are removed from the frame during cleaning.
drop_cols = [
    'title_english',
    'title_japanese',
    'title_synonyms',
    'image_url',
    'status',
    'airing',
    'aired_string',
    'aired',
    'duration',
    'rating',
    'background',
    'premiered',
    'broadcast',
    'related',
    'opening_theme',
    'ending_theme',
]

In [None]:
# Remove the columns listed in drop_cols; drop() returns a new frame, so df_copy is left untouched.
df_cleanned = df_copy.drop(columns=drop_cols)
df_cleanned.head()


In [None]:
# Keep only the rows with a positive score.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on a filtered slice.
df_cleanned = df_cleanned[df_cleanned.score > 0].copy()

In [None]:
df_cleanned.info()

Matching data types and filling in nulls

In [None]:
# Fill null ranks with 0 first, then fill every remaining null with 'unknown'.
# assign() and fillna() both return new frames, so nothing is mutated in place
# (the previous column assignment + inplace=True acted on a filtered slice).
df_cleanned = (
    df_cleanned
    .assign(rank=df_cleanned['rank'].fillna(0))
    .fillna('unknown')
)
df_cleanned.info()

In [None]:
# Cast the rank column to a 64-bit integer.
df_cleanned = df_cleanned.astype({'rank': 'int64'})
df_cleanned.info()

In [None]:
# Upper-case every text column so that equal labels compare equal regardless of case.
for text_col in ('type', 'source', 'producer', 'licensor', 'studio', 'genre'):
    df_cleanned[text_col] = df_cleanned[text_col].str.upper()

df_cleanned.head()

### Export to csv for Tableau visualization

In [None]:
#df_cleanned.to_csv('AnimeListClean.csv', index = False)

Plot variables against each other to look for possible correlations

In [None]:
#df_cleaned.corr()
#sns.pairplot(df_cleanned)


Change type and source into numbers (encoding)

In [None]:
# One-hot encode the 'type' and 'source' columns; the new columns are named
# with the prefixes 't' and 'src' respectively.
type_dummy = pd.get_dummies(df_cleanned['type'], prefix='t')
src_dummy = pd.get_dummies(df_cleanned['source'], prefix='src')

In [None]:
src_dummy.head()


Add into df

In [None]:
# Append the one-hot columns side by side (axis=1) to the cleaned frame.
# NOTE(review): this cell is not idempotent — re-running it appends the dummy columns again.
df_cleanned = pd.concat([df_cleanned, type_dummy, src_dummy], axis=1)
df_cleanned.columns

In [None]:
df_cleanned.head()

Select features (X) and dependent variable (y)

In [None]:
# Target: the score column, kept as a one-column DataFrame.
y = df_cleanned[['score']]
# Features: whatever remains once the columns below are removed.
X = df_cleanned.drop(
    columns=[
        'anime_id', 'title', 'type', 'source', 'episodes',
        'score', 'scored_by', 'rank', 'popularity', 'members',
        'favorites', 'producer', 'licensor', 'studio', 'genre',
    ]
)

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line after each report.
y.info()
X.info()
df_cleanned.info()

Split data into training and testing sets

In [None]:
# 80/20 split. A fixed random_state makes the split, and every metric computed
# from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.info()

Train LR model.

In [None]:
# Fit a linear regression on the training split.
lr_model = LinearRegression().fit(X_train, y_train)

In [None]:
lr_model.score(X_train, y_train)

In [None]:
print("Intercept of the linear equation:", lr_model.intercept_) 
print("\nCOefficients of the equation are:", lr_model.coef_)


Test model

In [None]:
# Predict on the held-out features and wrap the result in a DataFrame with a
# 'Predicted Score' column (fresh 0..n-1 index).
yhat = pd.DataFrame(lr_model.predict(X_test), columns=['Predicted Score'])
yhat.head()


In [None]:
# y_test keeps the original row labels while yhat is indexed 0..n-1; resetting
# the index lets concat line the two up row by row.
actualScore = y_test.reset_index(drop=True) # Drop the index so that we can concat it, to create new dataframe
df_actual_vs_predicted = pd.concat([actualScore,yhat],axis =1)
# Transposed for a compact side-by-side view.
df_actual_vs_predicted.T

Scoring the performance of model

In [None]:
#define scoring function
def score_model(y_test, yhat):
    """Print MAE, MSE, RMSE and R^2 for a set of predictions.

    Parameters
    ----------
    y_test : true target values.
    yhat : predicted values, in the same row order as ``y_test``.

    Returns
    -------
    None. The four metrics are printed.
    """
    # Error metrics: closer to zero is better.
    print('MAE: ', metrics.mean_absolute_error(y_test, yhat))

    # MSE is computed once and reused for the RMSE line.
    mse = metrics.mean_squared_error(y_test, yhat)
    print('MSE: ', mse)
    print('RMSE: ', np.sqrt(mse))

    # Closer to one is better.
    print('R^2: ', metrics.r2_score(y_test, yhat))

In [None]:
score_model(y_test, yhat)

In [None]:
#plotting the true value vs the predicted value
def plot_test(xtest, yframe):
    """Draw one scatter plot per feature comparing actual and predicted scores.

    Parameters
    ----------
    xtest : test-set features, either a DataFrame or a 2-D array (an array is
        wrapped in a DataFrame so its columns can be iterated).
    yframe : DataFrame with a 'score' column and a 'Predicted Score' column,
        in the same row order as ``xtest``.

    Returns
    -------
    None. One figure is shown per feature column.
    """
    # Use the argument that was passed in; previously the global X_test was
    # read regardless of what the caller supplied.
    features = xtest if isinstance(xtest, pd.DataFrame) else pd.DataFrame(xtest)

    # Plain arrays are plotted positionally. Passing Series with different
    # indexes (original row labels vs. 0..n-1) would make seaborn align on the
    # index and pair up the wrong rows.
    actual = yframe['score'].to_numpy()
    predicted = yframe['Predicted Score'].to_numpy()

    for column in features.columns:
        x_values = features[column].to_numpy().astype(float)
        sns.scatterplot(x=x_values, y=actual)
        # +.05 nudges the predicted points to the right so they do not sit
        # exactly on top of the actual ones.
        sns.scatterplot(x=x_values + .05, y=predicted)
        plt.legend(['actual','predicted'], loc="lower center")
        plt.title(str(column))
        plt.show()

#plot_test(X_test, df_actual_vs_predicted)



Use polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures


In [None]:
# NOTE(review): with degree 1 the transform only adds a constant bias column to
# the original features — no squared or interaction terms are generated.
poly = PolynomialFeatures(1)

In [None]:
X_poly = poly.fit_transform(X)

In [None]:
# 80/20 split of the transformed features. A fixed random_state makes the split
# and the resulting metrics reproducible.
Xp_train, Xp_test, yp_train, yp_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

In [None]:
# Fit a linear regression on the transformed training features and show its
# score on that same training data.
poly_model = LinearRegression().fit(Xp_train, yp_train)
poly_model.score(Xp_train, yp_train)


In [None]:
# Predictions for the held-out transformed features, as a 'Predicted Score' column.
yp_hat = pd.DataFrame(poly_model.predict(Xp_test), columns=['Predicted Score'])
yp_hat.head()


In [None]:
# Reset the index of the true scores so they line up row by row with yp_hat.
p_actualScore = yp_test.reset_index(drop=True) # Drop the index so that we can concat it, to create new dataframe
dfp_actual_vs_predicted = pd.concat([p_actualScore,yp_hat],axis =1)
# Transposed for a compact side-by-side view.
dfp_actual_vs_predicted.T


In [None]:
score_model(yp_test, yp_hat)


In [None]:
#plot_test(Xp_test, dfp_actual_vs_predicted)

Can we use Genre, Producer, etc to predict the score?

In [None]:
df_cleanned.head()

In [None]:
#build a set of genre, this removes duplicates
def get_vals(colname, df=None):
    """Collect the distinct labels found in a comma-separated text column.

    Parameters
    ----------
    colname : name of a column whose cells are strings such as
        ``'ACTION, COMEDY'`` (labels separated by ``', '``).
    df : frame to read from. When omitted, the notebook-level ``df_cleanned``
        is looked up at call time, so the current frame is always used rather
        than whichever object existed when this function was defined.

    Returns
    -------
    set of str : every distinct label in the column.
    """
    if df is None:
        df = df_cleanned

    elements = set()
    # Iterate the values directly; Series.iteritems() no longer exists in pandas 2.x.
    for value in df[colname]:
        elements.update(value.split(', '))
    return elements
    
   


In [None]:
# Distinct labels found in each multi-valued text column.
genres = get_vals('genre')
#print(genres)
producers = get_vals('producer')
licensors = get_vals('licensor')
studios = get_vals('studio')
#print(producers)
#print(licensors)
#print(studios)



In [None]:
#function to add genres as columns
def add_cols(list, prefix, df=None):
    """Add one zero-filled uint8 indicator column per label, in place.

    Parameters
    ----------
    list : iterable of label strings (the name shadows the builtin; it is kept
        so existing calls keep working).
    prefix : text placed before each label; columns are named
        ``prefix + '_' + label``.
    df : frame to modify. When omitted, the notebook-level ``df_cleanned`` is
        looked up at call time.

    Returns
    -------
    None. ``df`` is modified in place and a confirmation line is printed.
    """
    if df is None:
        df = df_cleanned

    # Sorting gives the same column order on every run; iterating a set of
    # strings directly can produce a different order after each kernel restart.
    for label in sorted(list):
        # Create the column with its final dtype in a single step.
        df[prefix + '_' + label] = np.zeros(len(df), dtype='uint8')
    print("Columns added.")



In [None]:
add_cols(genres,'g')
df_cleanned.head()


In [None]:
#function to set value to 1 if anime is of the genre
def set_val(column, prefix, df=None):
    """Set the indicator columns to 1 for every label listed in a row, in place.

    Parameters
    ----------
    column : name of the text column whose cells hold labels separated by ``', '``.
    prefix : prefix of the indicator columns; the cell at
        ``prefix + '_' + label`` is set to 1 for each label in the row.
    df : frame to modify. When omitted, the notebook-level ``df_cleanned`` is
        looked up at call time.

    Returns
    -------
    None. ``df`` is modified in place and a confirmation line is printed.
    """
    if df is None:
        df = df_cleanned

    # Series.items() replaces iteritems(), which no longer exists in pandas 2.x.
    for row_label, value in df[column].items():
        for label in value.split(', '):
            df.loc[row_label, prefix + '_' + label] = 1
    print("Values set.")


In [None]:

set_val('genre','g')
df_cleanned.head()


In [None]:
#df_cleanned.info()

In [None]:
y_genre = df_cleanned[['score']]
# Keep only the genre indicator columns as features: remove the non-feature
# columns and every type/source one-hot column. The one-hot names come from the
# dummy frames themselves, so the cell keeps working when the data contains a
# different set of type/source categories than a hand-typed list would expect.
X_genre = df_cleanned.drop(
    columns=[
        'anime_id', 'title', 'type', 'source', 'episodes', 'score', 'scored_by',
        'rank', 'popularity', 'members', 'favorites', 'producer', 'licensor',
        'studio', 'genre',
    ]
    + type_dummy.columns.tolist()
    + src_dummy.columns.tolist()
)


In [None]:
# split into train/test (80/20); a fixed random_state keeps the split and the
# resulting metrics reproducible
Xg_train, Xg_test, yg_train, yg_test = train_test_split(X_genre, y_genre, test_size=0.2, random_state=42)

In [None]:
# Train a linear regression on the genre training split.
lr_model2 = LinearRegression().fit(Xg_train,yg_train)

In [None]:
lr_model2.score(Xg_train,yg_train)


In [None]:
print("Intercept of the linear equation:", lr_model2.intercept_) 
print("\nCOefficients of the equation are:", lr_model2.coef_)


In [None]:
# Predictions for the held-out genre features, as a 'Predicted Score' column.
yg_hat = pd.DataFrame(lr_model2.predict(Xg_test), columns=['Predicted Score'])
yg_hat.head()


In [None]:

# Reset the index of the true scores so they line up row by row with yg_hat.
actualScore2 = yg_test.reset_index(drop=True) # Drop the index so that we can concat it, to create new dataframe
df_actual_vs_predicted2 = pd.concat([actualScore2,yg_hat],axis =1)
# Transposed for a compact side-by-side view.
df_actual_vs_predicted2.T

In [None]:
score_model(yg_test,yg_hat)

Feature Selection

In [None]:
df_cleanned.columns

In [None]:
# (labels, column prefix, source text column) for each multi-valued field.
label_groups = [
    (producers, 'p', 'producer'),
    (licensors, 'l', 'licensor'),
    (studios, 'st', 'studio'),
]

# First create all zero-filled indicator columns...
for group_labels, group_prefix, _ in label_groups:
    add_cols(group_labels, group_prefix)

# ...then flag the labels present in each row.
for _, group_prefix, group_column in label_groups:
    set_val(group_column, group_prefix)


In [None]:
df_cleanned.info()

In [None]:
# Target as a plain Series.
y_all = df_cleanned['score']
# Features: every indicator column, i.e. whatever is left after removing the columns below.
X_all = df_cleanned.drop(
    columns=[
        'anime_id', 'title', 'type', 'source', 'episodes',
        'score', 'scored_by', 'rank', 'popularity', 'members',
        'favorites', 'producer', 'licensor', 'studio', 'genre',
    ]
)

In [None]:
X_all.info()

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
 


In [None]:
# Score every feature against the target using mutual information.
# Only the fitted scores_ are read afterwards, so the ranking table below
# contains all features, not just k of them.
bestfeatures = SelectKBest(score_func=mutual_info_regression, k=10)
fit = bestfeatures.fit(X_all, y_all)

# One row per feature: its name ('Specs') next to its score ('Score').
featureScores = pd.DataFrame({'Specs': X_all.columns, 'Score': fit.scores_})


In [None]:
topFeatures = featureScores.nlargest(50,'Score')  # keep the 50 highest-scoring features
# Transposed for a compact view.
topFeatures.T

In [None]:
topFeatures['Specs'].values

In [None]:
# Target as a one-column DataFrame; features restricted to the columns named in topFeatures.
y_final = df_cleanned[['score']]
X_final = df_cleanned[topFeatures['Specs'].values]
X_final.head()

In [None]:
# 80/20 split of the selected features. A fixed random_state keeps the split
# and the resulting metrics reproducible.
Xf_train, Xf_test, yf_train, yf_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

In [None]:
# Fit on the selected features, predict the held-out rows and print the metrics.
lr_model3 = LinearRegression().fit(Xf_train, yf_train)
yf_hat = pd.DataFrame(lr_model3.predict(Xf_test), columns=['Predicted Score'])
score_model(yf_test, yf_hat)

In [None]:
# Reset the index of the true scores so they line up row by row with yf_hat.
actualScore3 = yf_test.reset_index(drop=True) # Drop the index so that we can concat it, to create new dataframe
df_actual_vs_predicted3 = pd.concat([actualScore3,yf_hat],axis =1)
# Transposed for a compact side-by-side view.
df_actual_vs_predicted3.T

## Conclusion
Given the available parameters, we cannot predict an anime's score accurately enough to be confident that the model does better than random chance.

__Reasons:__
- Score given to an anime by a watcher is very subjective.
- Data does not capture the necessary features to predict score with high level of confidence.

__Other Observations:__
- Going in, I thought genre and studio would be strong features for predicting the score.