## Load CSV to DataFrame

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
# Folder holding the CSV files, expressed relative to the absolute mount point
# used by drive.mount('/content/drive').  An absolute path keeps loading (and the
# later submission export) working even if the working directory is changed.
corpus_root = '/content/drive/My Drive/Colab Notebooks/'

# Labelled training data and the unlabelled test data.
df_train = pd.read_csv(corpus_root + 'Boy_or_girl_train.csv')
df_test = pd.read_csv(corpus_root + 'Boy_or_girl_test_no_solution.csv')

In [None]:
def modify_data(df):
  """Return a new frame in which zodiac names in ``star_sign`` are integer codes.

  The twelve signs (written in Traditional Chinese) are numbered 1-12 in the
  order listed below, starting with Aquarius.  Values that are not in the
  mapping are left unchanged, and the input frame itself is not modified.
  """
  zodiac_order = [
      '水瓶座', '雙魚座', '牡羊座', '金牛座', '雙子座', '巨蟹座',
      '獅子座', '處女座', '天秤座', '天蠍座', '射手座', '摩羯座',
  ]
  sign_to_code = {sign: code for code, sign in enumerate(zodiac_order, start=1)}

  return df.replace({"star_sign": sign_to_code})

# Replace the zodiac names in the training frame's star_sign column with integer codes.
df_train = modify_data(df_train)

## 有遺漏值的話先用插值法補值

In [None]:
# Show which columns contain missing values.  An explicit display() is needed
# because only the last expression of a cell is rendered automatically.
display(df_train.isnull().any())

# Linearly interpolate gaps in the numeric columns only.  Text columns cannot be
# interpolated (recent pandas versions warn or raise when asked to), so they are
# left as they are; wrapping the result in fillna() added nothing.
numeric_cols = df_train.select_dtypes(include='number').columns
df_train[numeric_cols] = df_train[numeric_cols].interpolate()

df_train

Unnamed: 0,id,timestamp,gender,star_sign,phone_os,height,weight,sleepiness,iq,fb_friends,yt,self_intro
0,1,4/7/2017 13:47,2,8,Apple,154.00,43.00000,2,180.00,583.0,0,Beautiful
1,2,4/7/2017 13:48,2,8,Apple,156.00,47.00000,2,130.00,400.0,3.5,Enjoying being who I'm notsss
2,3,4/7/2017 13:49,1,11,Android,170.00,61.00000,3,90.00,540.0,5,Practice Makes perfect
3,4,4/7/2017 13:49,1,11,Apple,170.00,62.00000,4,100.00,173.0,5,Straightforward
4,5,4/7/2017 13:54,2,11,Android,158.00,67.00000,3,128.00,320.0,1.2,Humorous
...,...,...,...,...,...,...,...,...,...,...,...,...
476,477,13/03/2018 10:18:20,1,2,Android,187.00,87.00000,4,87.00,9487.0,5.9487,I'm a 87.
477,478,13/03/2018 10:18:29,2,5,Android,180.00,88.00000,5,200.00,888.0,1,"I'm a ""Typical Lady""."
478,479,13/03/2018 10:19:40,2,2,Android,158.00,56.00000,3,110.00,589.0,0,I am so cuuuuuuute!
479,480,13/03/2018 10:20:20,1,8,Android,188.77,87.59487,5,50.87,9487.0,8.74,Let's party!


## 將 object 型別欄位轉為數值

In [None]:
# 把 Object 變成數字的function
from sklearn.preprocessing import LabelEncoder
# df_train['phone_os'] = LabelEncoder().fit_transform(df_train['phone_os'])

def transform_object(df):
  """Integer-encode every non-numeric column of ``df`` in place and return ``df``.

  Numeric and boolean columns are left untouched.  Each remaining column is
  replaced by consecutive integer codes starting at 0, assigned in order of
  first appearance.  When a column contains missing values they receive code 0
  and the other codes are shifted up by one.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to encode; its non-numeric columns are overwritten.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in.
  """
  # Pick the columns by dtype.  Deriving the numeric columns from df.corr()
  # computes a full correlation matrix for nothing and raises on pandas >= 2.0
  # as soon as the frame holds a string column.
  non_numeric_cols = [
      col for col, dtype in df.dtypes.items()
      if not pd.api.types.is_numeric_dtype(dtype)
  ]

  for col in non_numeric_cols:
    # factorize() marks missing values with -1; re-indexing the sorted unique
    # codes maps them onto 0..k, which is exactly what running a LabelEncoder
    # over the factorized codes produced.
    codes = df[col].factorize()[0]
    df[col] = np.unique(codes, return_inverse=True)[1]
  return df

# Integer-encode the non-numeric columns of the training frame.
df_train = transform_object(df_train)

## 刪除不重要欄位、重複值與離群值

In [None]:
# Remove the columns this notebook treats as unimportant, then discard
# rows that are exact duplicates of an earlier row.
df_train = (
    df_train
    .drop(columns=['id', 'timestamp', 'self_intro', 'star_sign', 'phone_os'])
    .drop_duplicates()
)

# Summary statistics, one row per column, to look for abnormal or extreme values.
df_train.describe().astype(np.float64).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gender,397.0,1.239295,0.4271913,1.0,1.0,1.0,1.0,2.0
height,397.0,2.518892e+108,5.018856e+109,-1000.0,165.0,172.0,176.0,1e+111
weight,397.0,2.518892e+108,5.018856e+109,-1000.0,55.0,65.0,73.0,1e+111
sleepiness,397.0,3.355164,1.237995,1.0,2.0,3.0,4.0,5.0
iq,397.0,125.5875,38.02821,50.0,100.0,120.0,150.0,200.0
fb_friends,397.0,2.518892e+252,inf,-1000.0,200.0,430.0,722.0,1e+255
yt,397.0,33.01511,34.17926,0.0,8.0,17.0,48.0,128.0


In [None]:
# Per-column 15th and 85th percentiles and the spread between them.
Q1 = df_train.quantile(0.15)
Q3 = df_train.quantile(0.85)
IQR = Q3 - Q1

# ref: https://www.pluralsight.com/guides/cleaning-up-data-from-outliers & https://medium.com/@prashant.nair2050/hands-on-outlier-detection-and-treatment-in-python-using-1-5-iqr-rule-f9ff1961a414
# A value is flagged when it lies more than 1.5 * IQR outside the two percentiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = df_train.lt(lower_fence) | df_train.gt(upper_fence)

# Drop every row that has at least one flagged value.
df_out = df_train[~is_outlier.any(axis=1)]
# Alternative that keeps the rows and replaces flagged values with the column median:
# df_out = df_train[~((df_train < (Q1 - 1.5 * IQR)) |(df_train > (Q3 + 1.5 * IQR)))].fillna(df_train.median())

# df_out = df_out[df_out['weight']>20]
print(df_out.shape)
df_out.describe().astype(np.float64).T

(364, 7)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gender,364.0,1.247253,0.432009,1.0,1.0,1.0,1.0,2.0
height,364.0,170.6675,8.329113,147.0,166.0,171.0,176.0,200.0
weight,364.0,64.147995,12.655874,10.0,55.0,63.0,72.0,110.0
sleepiness,364.0,3.387363,1.19501,1.0,3.0,3.0,4.0,5.0
iq,364.0,125.262363,36.319905,50.0,100.0,120.0,145.0,200.0
fb_friends,364.0,508.68956,426.0295,0.0,200.0,400.0,681.25,2355.0
yt,364.0,31.299451,33.55561,0.0,8.0,17.0,41.25,126.0


In [None]:
# Continue with the outlier-filtered frame from here on.
df_train = df_out

## 切割訓練與驗證資料集

In [None]:
# Summary statistics; the result is discarded because it is not the cell's last expression.
df_train.describe().T

# Target is the gender column; every remaining column is a feature.
y = df_train['gender']
X = df_train.drop(columns=['gender'])

## AdaBoost

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

######################
# Train the model
# Hold out 20% of the rows for validation (fixed seed so the split is repeatable).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=101)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

# Boosted depth-4 decision trees.  random_state seeds the base trees so that
# repeated runs give the same model and the same reported scores.
# NOTE(review): recent scikit-learn releases deprecate the `algorithm` argument;
# it is kept so that older releases keep using SAMME rather than their default.
model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=4),
                         algorithm="SAMME", learning_rate=0.1,
                         n_estimators=20, random_state=101)

model = model.fit(X_train, y_train)
pred = model.predict(X_valid)

print ('Accuracy = {:0.2f}%'.format(100.0 * accuracy_score(y_valid, pred)))

# 10-fold cross-validation on the training split; report it instead of
# computing the scores and silently discarding them.
scores = cross_val_score(model, X_train, y_train, cv=10, scoring = "accuracy")
print('CV accuracy = {:0.2f}% (+/- {:0.2f}%)'.format(100.0 * scores.mean(), 100.0 * scores.std()))

print(classification_report(y_valid, pred))
print(confusion_matrix(y_valid, pred))
print(pred, y_valid)

Accuracy = 94.52%
              precision    recall  f1-score   support

           1       0.98      0.95      0.97        60
           2       0.80      0.92      0.86        13

    accuracy                           0.95        73
   macro avg       0.89      0.94      0.91        73
weighted avg       0.95      0.95      0.95        73

[[57  3]
 [ 1 12]]
[2 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 2 1 2 2 1 1
 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 2] 95     2
116    1
313    1
358    1
411    1
      ..
351    1
12     2
161    1
427    1
23     2
Name: gender, Length: 73, dtype: int64


In [None]:
# Apply the same star-sign mapping that was applied to the training frame.
df_test = modify_data(df_test)

# Keep the ids for the submission file.
# NOTE(review): `id` shadows the built-in; the name is kept because the export cell reads it.
id = df_test['id']

# Remove the columns that are not model features.
df_test = df_test.drop(
    columns=['id', 'timestamp', 'self_intro', 'star_sign', 'phone_os', 'gender']
)

# Encode the remaining non-numeric columns, then scale with the scaler fitted on the training split.
# NOTE(review): the integer codes are derived from the test frame alone, so they may not
# line up with the codes assigned to the training frame - confirm this is intended.
df_test = scaler.transform(transform_object(df_test))

res = model.predict(df_test)
print(res)

[2 1 1 1 1 1 1 2 1 2 2 2 2 2 1 1 2 2 1 1 1 2 2 1 1 1 1 1 2 1 2 1 2 1 1 1 1
 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 1 1 1 2 1 1 2 2 1
 2 1 1 1 1 2 1 1 1 1 1 2 2 1 2 2 1 1 1 1 1 2 1 1 1 1 2 2 2 1 2 1 2 1 1 1 1
 1 1 2 2 2 2 1 1]


In [None]:
# Save the predictions: one row per test id with its predicted gender, written without the index.
submission_df = pd.DataFrame({'Id': id, 'gender': res})
submission_df.to_csv(corpus_root + 'Ada_submition.csv', index=False)