In [1]:
import numpy as np
import pandas as pd
import os
import copy

import matplotlib.pyplot as plt
import seaborn as sns    # 基於 matplotlib 的資料視覺化套件
plt.style.use('ggplot')       # 設定圖型風格
sns.set( color_codes = True )


# 計算眾數(Mode)
from collections import defaultdict  
# 資料正規化的套件
from sklearn.preprocessing import MinMaxScaler 
# 資料標準化的套件
from sklearn.preprocessing import StandardScaler  

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# 分類問題：LogisticRegression；迴歸問題：LinearRegression
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso 

# 決策樹：
# 分類問題：DecisionTreeClassifier；迴歸問題：DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# 隨機森林
# 分類問題：RandomForestClassifier；迴歸問題：RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# 梯度提升機
# 分類問題：GradientBoostingClassifier；迴歸問題：GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

# 評估模型
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

# Loading Raw Data

In [2]:
dir_data = '/Users/YenLin/Desktop/ML_100/每日作業/D48_data-science-london-scikit-learn/'
training_data_name = 'train.csv'
test_data_name = 'test.csv'
trainLabels_name = 'trainLabels.csv'

training_data = os.path.join( dir_data, training_data_name )
test_data = os.path.join( dir_data, test_data_name )
train_labels = os.path.join( dir_data, trainLabels_name )

print( 'Path of Training Data: %s' % ( training_data ) )
print( 'Path of Test Data: %s' % ( test_data ) )
print( 'Path of Train Label Data: %s' % ( train_labels ) )

# 資料讀取
training_data = pd.read_csv( training_data, header = None, encoding = 'utf-8' )
testing_data = pd.read_csv( test_data, header = None, encoding = 'utf-8' )
train_labels = pd.read_csv( train_labels, header = None, encoding = 'utf-8' )

Path of Training Data: /Users/YenLin/Desktop/ML_100/每日作業/D48_data-science-london-scikit-learn/train.csv
Path of Test Data: /Users/YenLin/Desktop/ML_100/每日作業/D48_data-science-london-scikit-learn/test.csv
Path of Train Label Data: /Users/YenLin/Desktop/ML_100/每日作業/D48_data-science-london-scikit-learn/trainLabels.csv


# Preview Raw Data

In [3]:
print(  'Size of Training Data = ' + str( training_data.shape ) + '\n' )
training_data.head( )

Size of Training Data = (1000, 40)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.299403,-1.226624,1.498425,-1.17615,5.289853,0.208297,2.404498,1.594506,-0.051608,0.663234,...,-0.850465,-0.62299,-1.833057,0.293024,3.552681,0.717611,3.305972,-2.715559,-2.682409,0.10105
1,-1.174176,0.332157,0.949919,-1.285328,2.199061,-0.151268,-0.427039,2.619246,-0.765884,-0.09378,...,-0.81975,0.012037,2.038836,0.468579,-0.517657,0.422326,0.803699,1.213219,1.382932,-1.817761
2,1.192222,-0.414371,0.067054,-2.233568,3.658881,0.089007,0.203439,-4.219054,-1.184919,-1.24031,...,-0.604501,0.750054,-3.360521,0.856988,-2.751451,-1.582735,1.672246,0.656438,-0.932473,2.987436
3,1.57327,-0.580318,-0.866332,-0.603812,3.125716,0.870321,-0.161992,4.499666,1.038741,-1.092716,...,1.022959,1.275598,-3.48011,-1.065252,2.153133,1.563539,2.767117,0.215748,0.619645,1.883397
4,-0.613071,-0.644204,1.112558,-0.032397,3.490142,-0.011935,1.443521,-4.290282,-1.761308,0.807652,...,0.513906,-1.803473,0.518579,-0.205029,-4.744566,-1.520015,1.830651,0.870772,-1.894609,0.408332


In [4]:
print(  'Size of Testing Data = ' + str( testing_data.shape ) + '\n' )
testing_data.head( )

Size of Testing Data = (9000, 40)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,2.808909,-0.242894,-0.546421,0.255162,1.749736,-0.030458,-1.322071,3.578071,-0.667578,-0.884257,...,-0.261688,-0.224375,-1.675606,-0.479584,-0.244388,-0.672355,0.51786,0.010665,-0.419214,2.818387
1,-0.374101,0.537669,0.081063,0.756773,0.915231,2.557282,3.703187,1.673835,-0.764122,-1.22804,...,-0.969463,0.574154,-2.200519,-1.61224,0.179031,-2.924596,0.64361,-1.470939,-0.067408,-0.976265
2,-0.08837,0.154743,0.380716,-1.176126,1.699867,-0.258627,-1.384999,1.093584,1.596633,0.230631,...,-0.769885,-0.005143,1.46749,0.483803,-3.542981,0.814561,-1.652948,1.265866,-1.749248,1.773784
3,-0.685635,0.501283,1.873375,0.215224,-3.983468,-0.103637,4.136113,-0.225431,-1.515015,-1.071763,...,0.968609,2.386412,-0.131219,0.285646,2.302069,1.255588,-1.56309,-0.125258,-1.030761,-2.945329
4,0.350867,0.721897,-0.477104,-1.748776,-2.627405,1.075433,4.954253,-3.293501,-0.760369,0.20436,...,0.260553,-2.04565,-2.173227,0.372992,0.4507,-0.211657,1.301359,-0.522164,2.484883,0.039213


In [5]:
print(  'Size of Training Labels = ' + str( train_labels.shape ) + '\n' )
train_labels.head( )

Size of Training Labels = (1000, 1)



Unnamed: 0,0
0,1
1,0
2,0
3,1
4,0


# Numbers of Missing Values

In [6]:
# training data
missing_train = training_data.isnull( ).sum( )  
missing_train = missing_train[ missing_train > 0 ]
missing_train.sort_values( inplace = True )

Train_NaN = pd.DataFrame( { 'col_name' : missing_train.index, 'counts' : missing_train.values } )  # Convert Series to DataFrame
Train_NaN

Unnamed: 0,col_name,counts


In [7]:
# test data
missing_test = testing_data.isnull( ).sum( )  
missing_test = missing_test[ missing_test > 0 ]
missing_test.sort_values( inplace = True )

Test_NaN = pd.DataFrame( { 'col_name' : missing_test.index, 'counts' : missing_test.values } )  # Convert Series to DataFrame
Test_NaN

Unnamed: 0,col_name,counts


# Merge Data

In [8]:
# 資料串聯
data = pd.concat( [ training_data, testing_data ], sort = False, ignore_index = True )
print( 'Size of Data = ' + str( data.shape ) )

data.head( )

Size of Data = (10000, 40)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.299403,-1.226624,1.498425,-1.17615,5.289853,0.208297,2.404498,1.594506,-0.051608,0.663234,...,-0.850465,-0.62299,-1.833057,0.293024,3.552681,0.717611,3.305972,-2.715559,-2.682409,0.10105
1,-1.174176,0.332157,0.949919,-1.285328,2.199061,-0.151268,-0.427039,2.619246,-0.765884,-0.09378,...,-0.81975,0.012037,2.038836,0.468579,-0.517657,0.422326,0.803699,1.213219,1.382932,-1.817761
2,1.192222,-0.414371,0.067054,-2.233568,3.658881,0.089007,0.203439,-4.219054,-1.184919,-1.24031,...,-0.604501,0.750054,-3.360521,0.856988,-2.751451,-1.582735,1.672246,0.656438,-0.932473,2.987436
3,1.57327,-0.580318,-0.866332,-0.603812,3.125716,0.870321,-0.161992,4.499666,1.038741,-1.092716,...,1.022959,1.275598,-3.48011,-1.065252,2.153133,1.563539,2.767117,0.215748,0.619645,1.883397
4,-0.613071,-0.644204,1.112558,-0.032397,3.490142,-0.011935,1.443521,-4.290282,-1.761308,0.807652,...,0.513906,-1.803473,0.518579,-0.205029,-4.744566,-1.520015,1.830651,0.870772,-1.894609,0.408332


# Normalization

In [9]:
# 設定縮放的區間上下限
Min_Max_Scaler = MinMaxScaler( feature_range = ( 0, 1 ) ) 

MinMax_data = pd.DataFrame( )

for col in data.columns :
    X = data[ col ].values.reshape( -1, 1 ) 
    scaled_value = Min_Max_Scaler.fit_transform( X )
    MinMax_data[ col ] = scaled_value.reshape( -1 )

    
MinMax_data.head( )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.521848,0.385,0.696357,0.337709,0.596758,0.459816,0.678536,0.620093,0.514675,0.570771,...,0.452269,0.403072,0.39028,0.563861,0.790045,0.573629,0.698835,0.143881,0.358967,0.462177
1,0.323043,0.584283,0.621256,0.322782,0.515096,0.415239,0.512366,0.687253,0.422438,0.470981,...,0.456028,0.48084,0.616589,0.583676,0.530993,0.534213,0.557137,0.655388,0.624663,0.351963
2,0.642301,0.488843,0.500375,0.193134,0.553666,0.445027,0.549366,0.239082,0.368326,0.319846,...,0.482374,0.57122,0.301001,0.627517,0.388825,0.266564,0.606321,0.582898,0.473336,0.627967
3,0.69371,0.467627,0.372577,0.415962,0.539579,0.541891,0.52792,0.810493,0.655478,0.339302,...,0.681567,0.635581,0.294011,0.410551,0.700972,0.686549,0.668321,0.525522,0.574777,0.564552
4,0.398744,0.459459,0.643525,0.494089,0.549207,0.432513,0.622141,0.234414,0.293894,0.589808,...,0.619261,0.258504,0.527731,0.507645,0.261976,0.274937,0.615291,0.610803,0.410455,0.479827


# Split Data into Training and Testing Data

In [10]:
training_data = MinMax_data.head( training_data.shape[0] )
testing_data = MinMax_data.tail( testing_data.shape[0] )

print( 'Size of Training Data = ' + str( training_data.shape ) )
print( 'Size of Testing Data = ' + str( testing_data.shape ) )

Size of Training Data = (1000, 40)
Size of Testing Data = (9000, 40)


# Select Futures

In [11]:
estimator = GradientBoostingClassifier( )
estimator.fit( training_data.values, train_labels )
feats = pd.Series( data = estimator.feature_importances_, index = training_data.columns )
feats = feats.sort_values( ascending = False )
feats

  y = column_or_1d(y, warn=True)


14    0.257958
12    0.135360
29    0.088485
39    0.076895
36    0.075447
18    0.065429
32    0.063359
34    0.038276
4     0.036231
6     0.033138
28    0.022373
7     0.022002
23    0.014753
22    0.011260
38    0.008320
2     0.006249
20    0.005681
3     0.005079
1     0.003537
0     0.003010
11    0.002945
21    0.002580
26    0.002436
13    0.001978
35    0.001940
5     0.001898
25    0.001757
9     0.001584
15    0.001233
33    0.001115
8     0.001113
24    0.001034
27    0.000944
10    0.000922
19    0.000863
17    0.000830
30    0.000787
16    0.000627
37    0.000541
31    0.000029
dtype: float64

In [12]:
Selected_Features = list( feats[ : 20 ].index )
Selected_Features

[14, 12, 29, 39, 36, 18, 32, 34, 4, 6, 28, 7, 23, 22, 38, 2, 20, 3, 1, 0]

# Build the Model

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split( training_data[ Selected_Features ], train_labels, test_size = 0.15, random_state = 1 )

In [14]:
RFC = RandomForestClassifier( n_estimators = 8, max_depth = 5 )
RFC.fit( training_data[ Selected_Features ], train_labels )

Y_pred = RFC.predict( X_test )
acc = metrics.accuracy_score( Y_test, Y_pred )
print( "Acuuracy = ", acc, '\n' )

Acuuracy =  0.94 



  


In [15]:
Y_predictions = RFC.predict( testing_data[ Selected_Features ] )

In [18]:
submit = pd.DataFrame( { 'ID' : np.arange(1,submit.shape[0] + 1 ) , 'Solution' : Y_predictions } )
submit.to_csv( '/Users/YenLin/Desktop/ML_100/每日作業/(Submit)_Day_048_HW.csv', index = False )
submit

Unnamed: 0,ID,Solution
0,1,1
1,2,0
2,3,1
3,4,0
4,5,0
5,6,0
6,7,0
7,8,1
8,9,1
9,10,0
