# This Template is created to make grading fair and straightforward. Anything not in the place as mentioned in the template would not be graded.

<font color='red'> # NOTE: We would run the notebook through a Plagiarism Checker. If it is found to be copied, your work would not be graded, and the incident would be highlighted to NYU Authorities. </font>

# Import Library and Dataset

In [11]:
import pandas as pd
import sys
import numpy as np
from sklearn.preprocessing import StandardScaler
#import csv training file given to us in the beginning of the project
#we assume the file is in the same directory of this notebook
df=pd.read_csv("qudditch_training.csv")

# PART I: Preprocessing

#### Handling missing values. (If ANY)

In [12]:
#drop weight, finbourgh_flick, double_eight_loop due to missing information 
#and drop player_id due to not related to the target
df.drop(["weight","finbourgh_flick", "double_eight_loop","player_id"], axis=1,inplace=True)

#handling missing values by creating another category named 'U'
columns_replace=["house","player_code","move_specialty"]
for column in columns_replace:
	df[column].replace("?","U",inplace=True)
df["gender"].replace("Unknown/Invalid","U",inplace=True)

#drop category 'U' from gender
#only very few of rows have unknown type
df = df[df.gender != 'U']


#### Feature Datatype Conversion From Numeric to categoric and Vice-versa. (If ANY)

In [13]:
#define function for encoding (mapping)
def map_features(features,df,dict):
	for i in features:
		df = df.replace({i:dict})

	return df

#reducing nominal values in snitchnip and stooging
foul_dict={'None':'none','Norm':'norm','>7':'high','>8':'high','>200':'high','>300':'high'}
foul_columns=["snitchnip","stooging"]
df=map_features(foul_columns,df,foul_dict)

#generate move specialty dict for reducing nominal values
#1 stands for with specialty, 0 stands for without specialty
def convert_move_specialty(df):
	dict={}
	for i in df["move_specialty"]:
		if i=="U":
			dict.update({"U":0})
		else:
			dict.update({i:1})
	return dict

move_spec_dict=convert_move_specialty(df)
df=map_features(["move_specialty"],df,move_spec_dict)

#23 tactics feature, ready for conversion
tactics_columns=["body_blow","checking","dopplebeater_defence","hawkshead_attacking_formation","no_hands_tackle","power_play","sloth_grip_roll","spiral_dive","starfish_and_stick","twirl","wronski_feint","zig-zag","bludger_backbeat","chelmondiston_charge","dionysus_dive","reverse_pass","parkins_pincer","plumpton_pass","porskoff_ploy","transylvanian_tackle","woollongong_shimmy"]

#make a copy of dataframe for future use(feature reduction and extraction) before encoding
df_tactics_change=df.copy()

#convert tactics
#Steady, Up, Down to 1, No to 0
tactics_dict={'Steady':1,'No':0,'Up':1,'Down':1}
df=map_features(tactics_columns,df,tactics_dict)

#convert gender
#Female to 0, Male to 1

ordered_satisfaction = ["Female","Male"]
cat_dtype = pd.api.types.CategoricalDtype(ordered_satisfaction, ordered=True)
df["gender"]=df["gender"].astype(cat_dtype).cat.codes

#convert snitch_caught
#No to 0, Yes to 1

ordered_satisfaction = ["No","Yes"]
cat_dtype = pd.api.types.CategoricalDtype(ordered_satisfaction, ordered=True)
df["snitch_caught"]=df["snitch_caught"].astype(cat_dtype).cat.codes

#convert change
#No to 0,Ch to 1

ordered_satisfaction = ["No","Ch"]
cat_dtype = pd.api.types.CategoricalDtype(ordered_satisfaction, ordered=True)
df["change"]=df["change"].astype(cat_dtype).cat.codes

#covert target
#NO to 0, YES to 1
#ignore this part when transforming test data

ordered_satisfaction = ["NO","YES"]
cat_dtype = pd.api.types.CategoricalDtype(ordered_satisfaction, ordered=True)
df["quidditch_league_player"]=df["quidditch_league_player"].astype(cat_dtype).cat.codes

#one-hot encoding rest of columns

df=pd.get_dummies(df, columns=["house","foul_type_id","game_move_id","penalty_id","player_code","player_type","snitchnip","stooging"])



#### Feature Reduction or extraction. (If ANY)

In [14]:
#sum num_games_satout, num_games_injured, num_games_notpartof and combine them into one feature named num_game_not_participate

df["num_game_not_participate"]=df.num_games_satout+df.num_games_injured+df.num_games_notpartof

#sum up number of tactic changes into one feature named num_tactics_change

#encoding dictionary for helping calculation
#Up and Down count for change
tactics_change_dict={'Steady':0,'No':0,'Up':1,'Down':1}

#do encoding in the copy of dataframe, help calculation
df_tactics_change=map_features(tactics_columns,df_tactics_change,tactics_change_dict)

#initialize column filled by 0
df["num_tactics_change"]=0

#define function for sum change of tactics
def sum_change_tactics(df,df_copy,columns):

	for i in columns:

		df["num_tactics_change"]+=df_copy[i]

sum_change_tactics(df,df_tactics_change,tactics_columns)


#sum up number of tactics used by each player
#create new column named num_total_tactics

df["num_total_tactics"]=0
def sum_tactics(df,columns):

	for i in columns:
		
		df["num_total_tactics"]+=df[i]

	return df

sum_tactics(df,tactics_columns)

#move target to the last column
#ignore this part when transforming test data
df_target=df["quidditch_league_player"]
df.drop(["quidditch_league_player"], axis=1,inplace=True)
df.insert(len(df.columns),"quidditch_league_player", df_target)



#### Any other Pre-processing Used. (Give the name along with the code.)

In [15]:
#log transform

log_transform_columns=["num_games_satout","num_games_injured","num_games_notpartof"]
def log_transform(df,columns):

	for i in columns:
		#add 1 to original values to perform log transform
		df[i]+=1
		df[i]=df[i].apply(np.log)

log_transform(df,log_transform_columns)

#Standardization (v-mean)/std

numeric_columns=["game_duration","num_game_moves","num_game_losses","num_practice_sessions","num_games_satout","num_games_injured","num_games_notpartof","num_games_won","age","num_total_tactics","num_game_not_participate","num_tactics_change"]
def standardize_numeric_value(df,columns):
	scaler = StandardScaler()
	for i in columns:

		df[i]=scaler.fit_transform(df[i].values.reshape(-1,1))

standardize_numeric_value(df,numeric_columns)

#remove outliers
def remove_outliers(df,columns):

	for i in columns:
		
		df = df[np.abs(df[i] - df[i].mean()) <= (3 * df[i].std())]
		
remove_outliers(df,numeric_columns)

#generate correlation matrix to observe
df_corr=df.corr()
df_corr.to_csv("correlation.csv")

df.to_csv("data_aftercleaned.csv",index=False)





# PART II: Classification

### Model 1:
Model Name: Logistic Regression<br>
Evaluation method and metric used Name: 5-fold cross validation and the average F1-score for both classes<br>
Name of the Hyperparameter used: training sampling, C, class_weight, penalty, solver, max_iter=1000 (fixed)<br>


In [16]:
from sklearn.model_selection import KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from collections import Counter
from sklearn.utils import resample

RANDOM_STATE = 42


def get_cross_validation_folds(k):
  kf = KFold(n_splits=k, random_state=RANDOM_STATE, shuffle=True)
  return kf

def separate_features_and_target(examples, target_index, feature_indices):
  X = np.array([element[feature_indices] for element in examples])
  y = np.array([element[target_index] for element in examples])  
  return X, y

def undersample_training(training_examples, training_classes):
  mcc = Counter(training_classes).most_common()[0] #lcc is the Most Common Class
  lcc = Counter(training_classes).most_common()[-1]
  examples_in_mcc = [elem for i, elem in enumerate(training_examples) if training_classes[i] == mcc[0]]
  number_of_samples = len(training_examples) - len(examples_in_mcc) #we undersample to the same number of samples in the least common class
  new_samples = resample(examples_in_mcc, n_samples=number_of_samples, random_state=RANDOM_STATE)
  training_examples = np.concatenate((new_samples, [elem for i, elem in enumerate(training_examples) if training_classes[i] == lcc[0]]))
  training_classes = np.concatenate(([mcc[0] for i in new_samples], [lcc[0] for i in xrange(number_of_samples)]))
  return training_examples, training_classes

def bootstrap_training(training_examples, training_classes):
  #this is where we perform oversampling (SMOTE)
  lcc = Counter(training_classes).most_common()[-1] #lcc is the Least Common Class
  examples_in_lcc = [elem for i, elem in enumerate(training_examples) if training_classes[i] == lcc[0]]
  #by doing so, you end up with the same number of samples per class  
  number_of_samples = len(training_examples) - 2*len(examples_in_lcc) 
  new_samples = resample(examples_in_lcc, n_samples=number_of_samples, random_state=RANDOM_STATE)
  training_examples = np.concatenate((training_examples, new_samples))
  training_classes = np.concatenate((training_classes, [lcc[0] for i in new_samples]))
  return training_examples, training_classes

def get_fmeasure(precision, recall):
  return (2.*precision*recall)/(precision+recall)

def process_input(filename):
  #ignoring header  
  lines = [np.array([float(i) for i in l.strip().split(',')]) for l in open(filename, 'r').readlines()[1:]] 
  return np.array(lines)

####### The functions in this cell down until here are useful for all models

def logistic_regression(examples, kfolds, target_index, feature_indices, params={}, sampling=None):
  lr = LogisticRegression(**params)
  precs = []; recs = []; accs = []; fmeasures = []
  for train_index, test_index in kfolds:
    X_train, y_train = separate_features_and_target(examples[train_index], target_index, feature_indices)
    if sampling == 'over': #bootstrap training samples in the minority class 
      X_train, y_train = bootstrap_training(X_train, y_train)
    elif sampling == 'under': #reduce the number of samples in the majority class
      X_train, y_train = undersample_training(X_train, y_train)
    clf = lr.fit(X_train, y_train)
    X_test, y_test = separate_features_and_target(examples[test_index], target_index, feature_indices)
    y_pred = clf.predict(X_test)
    results = precision_recall_fscore_support(y_test, y_pred)
    precs.append(results[0])
    recs.append(results[1])
    accs.append(clf.score(X_test, y_test))
    fmeasures.append((get_fmeasure(results[0][0], results[1][0]), get_fmeasure(results[0][1], results[1][1])))
  return accs, precs, recs, fmeasures


#below, we assume that the dataset generated in Part I is in the same folder of this notebook

examples = process_input('data_aftercleaned.csv') 
kfolds = get_cross_validation_folds(5)
SKIP = 1 
num_features = len(examples[0]) - 1 #one field is the target

### Now, we focus on Logistic Regression

best_params_logreg = {'C': 0.01, 'class_weight': None, 'penalty': 'l2', 'solver': 'sag', 'max_iter':1000}
sampling_logreg = 'over'
print 'LOGISTIC REGRESSION'
accs, precs, recs, fmeasures = logistic_regression(examples, kfolds.split(examples), -1,
                                                   np.array([i + SKIP for i in xrange(num_features - SKIP)]),
                                                   params=best_params_logreg, sampling=sampling_logreg)
print 'average accuracy for folds', np.mean(accs)
mean_overall_fmeasure = (np.mean([i[0] for i in fmeasures]) + np.mean([i[1] for i in fmeasures]))/2
print 'average fmeasure for both classes', mean_overall_fmeasure 


LOGISTIC REGRESSION
average accuracy for folds 0.661406468103
average fmeasure for both classes 0.526098499947


### Model 2:
Model Name: Adaboost<br>
Evaluation method and metric used Name: 5-fold cross validation and the average F1-score for both classes<br>
Name of the Hyperparameter used: Oversampling (SMOTE), weak classifiers are Decision Trees with max_depth = 1, n_estimators, and algorithm.<br>


In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

def adaboost(examples, kfolds, target_index, feature_indices, params={}, sampling=None):
  ab =  AdaBoostClassifier(**params)
  precs = []; recs = []; accs = []; fmeasures = []
  for train_index, test_index in kfolds:
    X_train, y_train = separate_features_and_target(examples[train_index], target_index, feature_indices)
    if sampling == 'over': #bootstrap training samples in the minority class 
      X_train, y_train = bootstrap_training(X_train, y_train)
    elif sampling == 'under': #reduce the number of samples in the majority class
      X_train, y_train = undersample_training(X_train, y_train)
    clf = ab.fit(X_train, [int(i) for i in y_train])
    X_test, y_test = separate_features_and_target(examples[test_index], target_index, feature_indices)
    y_pred = clf.predict(X_test)
    results = precision_recall_fscore_support(y_test, y_pred)
    precs.append(results[0])
    recs.append(results[1])
    accs.append(clf.score(X_test, y_test))
    fmeasures.append((get_fmeasure(results[0][0], results[1][0]), get_fmeasure(results[0][1], results[1][1])))
  return accs, precs, recs, fmeasures 

weak_learner = DecisionTreeClassifier(max_depth=1)
best_params_adaboost = {'base_estimator': weak_learner, 'n_estimators': 200, 'algorithm': 'SAMME'}
sampling_adaboost = 'over'
print 'ADABOOST'
accs, precs, recs, fmeasures = adaboost(examples, kfolds.split(examples), -1,
                                                   np.array([i + SKIP for i in xrange(num_features - SKIP)]),
                                                   params=best_params_adaboost, sampling=sampling_adaboost)
print 'average accuracy for folds', np.mean(accs)
mean_overall_fmeasure = (np.mean([i[0] for i in fmeasures]) + np.mean([i[1] for i in fmeasures]))/2
print 'average fmeasure for both classes', mean_overall_fmeasure 


ADABOOST
average accuracy for folds 0.693422251575
average fmeasure for both classes 0.534905655781


### Model 3:
Model Name: Random Forest <br>
Evaluation method and metric used Name:  5-fold cross validation and the average F1-score for both classes<br>
Name of the Hyperparameter used: Oversampling (SMOTE), n_estimators, criterion, max_depth, and max_features<br>


In [18]:
from sklearn.ensemble import RandomForestClassifier

def random_forest(examples, kfolds, target_index, feature_indices, params={}, sampling=None):
  rf = RandomForestClassifier(**params)
  precs = []; recs = []; accs = []; fmeasures = []
  for train_index, test_index in kfolds:
    X_train, y_train = separate_features_and_target(examples[train_index], target_index, feature_indices)
    if sampling == 'over': 
      X_train, y_train = bootstrap_training(X_train, y_train)
    elif sampling == 'under': 
      X_train, y_train = undersample_training(X_train, y_train)
    clf = rf.fit(X_train, [int(i) for i in y_train])
    X_test, y_test = separate_features_and_target(examples[test_index], target_index, feature_indices)
    y_pred = clf.predict(X_test)
    results = precision_recall_fscore_support(y_test, y_pred)
    precs.append(results[0])
    recs.append(results[1])
    accs.append(clf.score(X_test, y_test))
    fmeasures.append((get_fmeasure(results[0][0], results[1][0]), get_fmeasure(results[0][1], results[1][1])))
  return accs, precs, recs, fmeasures 

best_params_ranfor = {'n_estimators': 5, 'criterion': 'entropy', 'max_depth': 32, 'max_features': None}
sampling_ranfor = 'over'
print 'RANDOM FOREST'
accs, precs, recs, fmeasures = random_forest(examples, kfolds.split(examples), -1,
                                                   np.array([i + SKIP for i in xrange(num_features - SKIP)]),
                                                   params=best_params_ranfor, sampling=sampling_ranfor)
print 'average accuracy for folds', np.mean(accs)
mean_overall_fmeasure = (np.mean([i[0] for i in fmeasures]) + np.mean([i[1] for i in fmeasures]))/2
print 'average fmeasure for both classes', mean_overall_fmeasure 

RANDOM FOREST
average accuracy for folds 0.82739993048
average fmeasure for both classes 0.546336883784


# PART III: Best Hypothesis:
Model Name: Random Forest <br>
Reason: Highest F-measure (F1 Score) and highest accuracy. <br>
Hyper-parameter Value: n_estimators = 5, criterion = 'entropy', max_depth = 32, max_features = None<br>
<br>
Here, we add code that (1) reads one training and one test file without targets, (2) performs data processing on both files, and (3) generates predictions for the examples in the test file, saving them to disk. We assume that the training and the test files are in the same directory. Please change their names if necessary.

In [19]:
def process_generic_input(filename):
  #assumes that file has a header
  lines = open(filename, 'r').readlines()
  examples = [np.array([float(i) for i in l.strip().split(',')]) for l in lines[1:]]
  header = lines[0]
  features = header.strip().split(',')
  return np.array(examples), np.array(features)

def determine_feature_intersection(features_training, features_test):
  #there is no guarantee that the features in training and test are 
  #the same after processing, so we force our code to work with an
  #intersection  
  training_dict = {}
  test_dict = {}
  for index, elem in enumerate(features_training):
    training_dict[elem] = index
  for index, elem in enumerate(features_test):
    test_dict[elem] = index
  key_intersection = list(set(training_dict.keys()) & set(test_dict.keys()))
  training_indices = []
  test_indices = []
  for relevant_key in key_intersection:
    training_indices.append(training_dict[relevant_key])
    test_indices.append(test_dict[relevant_key])
  return sorted(training_indices), sorted(test_indices)

def random_forest_generic(examples_training, examples_test, target_index, 
                          features_training, features_test, params={}, sampling=None):
  rf = RandomForestClassifier(**params)
  precs = []; recs = []; accs = []; fmeasures = []
  X_train, y_train = separate_features_and_target(examples_training, target_index, features_training)
  #bootstrap training samples in the minority class 
  X_train, y_train = bootstrap_training(X_train, y_train)
  clf = rf.fit(X_train, y_train)
  X_test = np.array([element[features_test] for element in examples_test])
  y_pred = clf.predict(X_test)
  with open('test_outputs.csv', 'w') as f:
    f.write('id_num,quidditch_league_player\n')
    for index, elem in enumerate(y_pred):
      if int(elem) == 0:
        f.write(str(index+1) + str(',') + 'NO\n')
      else:
        f.write(str(index+1) + str(',') + 'YES\n')


#FIXME change code below to process it all directly from raw data
#ASSUMING CLEANED TRAINING AND TEST DATA AVAILABLE, FOLLOWING PART I
examples_training, features_training = process_generic_input("data_aftercleaned.csv")
examples_test, features_test = process_generic_input("test_data_aftercleaned.csv")
relevant_training_ids, relevant_test_ids = determine_feature_intersection(features_training, features_test)
best_sampling = 'over'
best_params_ranfor = {'n_estimators': 5, 'criterion': 'entropy', 'max_depth': 32, 'max_features': None}
random_forest_generic(examples_training, examples_test, -1, relevant_training_ids, relevant_test_ids, 
                      params=best_params_ranfor, sampling=best_sampling)