# This template is provided to make grading fair and straightforward. Anything not placed where the template specifies will not be graded.

# <font color='red'>NOTE: We will run the notebook through a plagiarism checker. If it is found to be copied, your work will not be graded, and the incident will be reported to NYU authorities.</font>

# Import Library and Dataset

In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing

# PART I: Preprocessing

#### Handling missing values. (If ANY)

In [2]:
# Load the training set and fold every missing-value marker into one shared
# category, "U".
data = pd.read_csv("qudditch_training.csv")
columns_replace = ["house", "player_code", "move_specialty"]
for column in columns_replace:
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: the in-place form acts on an intermediate object, which
    # warns on recent pandas and leaves `data` unchanged under Copy-on-Write.
    data[column] = data[column].replace("?", "U")
# gender marks unknown entries with its own sentinel string rather than "?".
data["gender"] = data["gender"].replace("Unknown/Invalid", "U")

#### Feature datatype conversion from numeric to categorical and vice versa. (If ANY)

In [6]:
def map_features(features,df,dict):
    """Apply one value mapping to each of the named columns.

    Parameters
    ----------
    features : iterable of column labels to rewrite.
    df : DataFrame to read from; it is not modified.
    dict : mapping of ``old value -> new value`` used for every listed column.

    Returns
    -------
    A DataFrame with the replacements applied. Columns that are not listed
    are carried over unchanged.
    """
    mapped = df
    for column_name in features:
        # Nested form {column: {old: new}} limits the replacement to one column.
        column_mapping = {column_name: dict}
        mapped = mapped.replace(column_mapping)
    return mapped

def convert_move_specialty(df):
    """Build a binary mapping for the values found in ``df["move_specialty"]``.

    The placeholder value "U" maps to 0; every other value that occurs in the
    column maps to 1. Keys appear in order of first occurrence.
    """
    return {
        specialty: (0 if specialty == "U" else 1)
        for specialty in df["move_specialty"]
    }

def _encode_ordered(series, categories):
    """Return integer codes for ``series`` with ``categories`` as the ordered levels.

    Each value is replaced by its position in ``categories``. Values that are
    missing or not listed receive the code -1 (the ``cat.codes`` convention).
    """
    level_dtype = pd.api.types.CategoricalDtype(categories, ordered=True)
    return series.astype(level_dtype).cat.codes


# Work on a separate frame so the loaded `data` is left untouched.
df = data.copy()

# Game-move columns: "Steady" becomes 1, every other listed status becomes 0.
new_dict={'Steady':1,'No':0,'Up':0,'Down':0}
game_move_columns=["body_blow","checking","dopplebeater_defence","hawkshead_attacking_formation","no_hands_tackle","power_play","sloth_grip_roll","spiral_dive","starfish_and_stick","twirl","wronski_feint","zig-zag","bludger_backbeat","chelmondiston_charge","dionysus_dive","double_eight_loop","finbourgh_flick","reverse_pass","parkins_pincer","plumpton_pass","porskoff_ploy","transylvanian_tackle","woollongong_shimmy"]
df = map_features(game_move_columns, df, new_dict)

# Identifier columns and weight are not used as features.
df = df.drop(columns=["id_num", "player_id", "weight"])

# Target, binary and ordinal columns: replace each level by its rank in the list.
ordinal_levels = {
    "quidditch_league_player": ["NO", "YES"],  # target
    "snitch_caught": ["No", "Yes"],
    "change": ["No", "Ch"],
    "snitchnip": ["None", "Norm", ">200", ">300"],
    "stooging": ["None", "Norm", ">7", ">8"],
}
for column, levels in ordinal_levels.items():
    df[column] = _encode_ordered(df[column], levels)

# Nominal columns: one-hot encode.
df = pd.get_dummies(df, columns=["house","foul_type_id","game_move_id","penalty_id","player_code","player_type"])

# Drop rows whose gender is the unknown placeholder, then renumber the rows.
# reset_index returns a new frame, so the assignment below is not made on a
# slice, and the contiguous index keeps later label-based alignment correct.
df = df[df["gender"] != "U"].reset_index(drop=True)
df["gender"] = _encode_ordered(df["gender"], ["Female", "Male"])

# move_specialty: 0 for the unknown placeholder "U", 1 for any recorded value.
# The mapping gets its own name so the builtin `dict` is not overwritten.
move_specialty_map = convert_move_specialty(df)
df = map_features(["move_specialty"], df, move_specialty_map)

df.head()

Unnamed: 0,gender,age,game_duration,move_specialty,num_game_moves,num_game_losses,num_practice_sessions,num_games_satout,num_games_injured,num_games_notpartof,...,player_code_WC,player_type_Beater1,player_type_Beater2,player_type_Captain,player_type_Chaser1,player_type_Chaser2,player_type_Chaser3,player_type_Keeper,player_type_Multiple,player_type_Seeker
0,0,11.0,1,1,41,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,12.0,3,0,59,0,18,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,13.0,2,0,11,5,13,2,0,1,...,0,0,0,0,0,0,0,0,1,0
3,1,14.0,2,0,44,1,16,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,14.5,1,0,51,0,8,0,0,0,...,0,0,0,1,0,0,0,0,0,0


#### Feature Reduction or extraction. (If ANY)

In [7]:
# Min-max scale the numeric / ordinal columns.
norm_columns=["age","game_duration","num_game_moves","num_game_losses","num_practice_sessions","num_games_satout","num_games_injured","num_games_notpartof","num_games_won","snitchnip","stooging"]
scaler = preprocessing.MinMaxScaler()
# Fit once on all listed columns. MinMaxScaler works per feature, so the values
# match a column-by-column fit, and the fitted `scaler` now describes every
# column (it can be reused to transform another data set).
df[norm_columns] = scaler.fit_transform(df[norm_columns])

# Separate the target (kept as a one-column DataFrame) from the feature frame.
df_target = df[["quidditch_league_player"]]
df = df.drop(columns=["quidditch_league_player"])

df.head()



Unnamed: 0,gender,age,game_duration,move_specialty,num_game_moves,num_game_losses,num_practice_sessions,num_games_satout,num_games_injured,num_games_notpartof,...,player_code_WC,player_type_Beater1,player_type_Beater2,player_type_Captain,player_type_Chaser1,player_type_Chaser2,player_type_Chaser3,player_type_Keeper,player_type_Multiple,player_type_Seeker
0,0,0.0,0.0,1,0.305344,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
1,0,0.166667,0.153846,0,0.442748,0.0,0.2125,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,0,0.333333,0.076923,0,0.076336,0.833333,0.15,0.047619,0.0,0.047619,...,0,0,0,0,0,0,0,0,1,0
3,1,0.5,0.076923,0,0.328244,0.166667,0.1875,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,1,0.583333,0.0,0,0.381679,0.0,0.0875,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0


#### Any other Pre-processing Used. (Give the name along with the code.)

In [9]:
# Standardise every feature, then project onto principal components.
array = df.values
array = StandardScaler().fit_transform(array)
# Keep all but 9 components; the output file name records this choice.
pca = PCA(n_components=len(df.columns)-9)
array_new = pca.fit_transform(array)
df_new = pd.DataFrame(array_new)
# Attach the target by position. `df_new` has a fresh 0..n-1 index while
# `df_target` keeps the index of `df`; passing the DataFrame itself would align
# on labels, which shifts targets and inserts NaN when that index has gaps.
df_new["quidditch_league_player"] = df_target["quidditch_league_player"].to_numpy()
df_new.to_csv("data_standard_scale_pca_minus_9_features.csv", index=False)

df_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,104,105,106,107,108,109,110,111,112,quidditch_league_player
0,2.694347,-4.843346,4.578653,1.389695,-0.303357,-0.50625,1.114255,1.0063,-3.496875,4.418681,...,2.260948,1.129428,-1.712971,-2.126725,-1.683431,1.013483,-0.468869,0.778282,-0.258711,0.0
1,-1.568306,-1.568537,0.069608,0.617192,0.761752,-0.774245,1.7204,-0.138077,-1.443468,-0.236124,...,1.22244,0.947712,-2.394981,0.982288,-1.108395,1.912125,-0.442068,-0.235546,0.301999,0.0
2,-0.962683,-2.917688,0.068955,0.833512,0.13928,1.237387,0.968087,-1.322287,0.670119,-0.712175,...,-0.931244,2.980536,0.27352,-0.138885,-0.552866,1.155714,-0.54675,-0.804475,0.379823,0.0
3,-1.233747,-1.364134,0.190202,0.057323,0.521935,-1.256113,0.813951,-0.473221,-1.128587,-0.438858,...,0.838083,1.092463,-1.485074,1.120071,0.319581,0.515393,0.157112,-0.364844,0.218197,0.0
4,-0.884897,-1.564178,-0.080987,0.035225,0.925869,-1.666163,-0.295315,-1.079295,-0.258414,0.201346,...,1.198372,0.485184,-1.202516,0.156197,-0.013096,0.783498,0.115976,0.927832,0.10962,0.0


# PART II: Classification

### Model 1:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [5]:
#Code...

### Model 2:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [None]:
#Code...

### Model 3:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [None]:
#Code...

# PART III: Best Hypothesis:
Model Name:------------<br>
Reason:--------------<br>
Hyper-parameter Value:-----------<br>
