In [1]:
# Colab only: mount Google Drive at /content/drive so the CSV paths used in
# main() (all under '/content/drive/My Drive/...') can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/sklearn deprecation notices that point at real problems.
warnings.filterwarnings('ignore')

In [3]:
'''
Function: impute_mode_age
Purpose: Impute missing values with the mode (computed separately for each sex)
Parameter: 
  df_in: Input DataFrame
Return:
  DataFrame after imputing missing values
#'''

'\nFunction: impute_mode_age\nPurpose: Impute missing values with the mode (computed separately for each sex)\nParameter: \n  df_in: Input DataFrame\nReturn:\n  DataFrame after imputing missing values\n#'

In [4]:
def impute_mode_age(df_in):
  """Fill missing ``Age`` values with the most frequent age of the same sex.

  Parameters:
    df_in: DataFrame with a ``Sex`` column and an ``Age`` column. Any
      encoding of ``Sex`` works; groups are formed from its distinct values.
  Returns:
    A new DataFrame with a fresh 0..n-1 index. Every missing ``Age`` is
    replaced by the integer-truncated mode of the known ages sharing its
    ``Sex`` value. Only the ``Age`` column is filled; rows whose group has
    no known age keep NaN. ``df_in`` itself is not modified.
  """
  df_out = df_in.reset_index(drop = True)

  def _group_mode(ages):
    # Series.mode() skips NaN and returns the modes in ascending order, so
    # the first entry is the smallest most-frequent age.
    modes = ages.mode()
    return float(int(modes.iloc[0])) if len(modes) else np.nan

  mode_by_sex = df_out.groupby('Sex')['Age'].agg(_group_mode)
  # Map each row's Sex to its group mode and use it only where Age is NaN.
  df_out['Age'] = df_out['Age'].fillna(df_out['Sex'].map(mode_by_sex))
  return df_out

In [5]:
'''
Function: impute_gbr_age
Purpose: Impute missing values using Gradient Boosting Regression to predict them
Parameter: 
  df_in: Input DataFrame
Return:
  DataFrame after imputing missing values
#'''

'\nFunction: impute_gbr_age\nPurpose: Impute missing values using Gradient Boosting Regression to predict them\nParameter: \n  df_in: Input DataFrame\nReturn:\n  DataFrame after imputing missing values\n#'

In [6]:
def impute_gbr_age(df_in):
  """Predict missing ``Age`` values with a gradient boosting regressor.

  The rows with a known ``Age`` train a regressor on all other columns of
  ``df_in``; the fitted model then fills the rows where ``Age`` is missing.
  The other columns are handed to the regressor as-is, so the caller is
  expected to have encoded and imputed them beforehand.

  Parameters:
    df_in: DataFrame containing an ``Age`` column.
  Returns:
    A new DataFrame, sorted by index, with missing ``Age`` values replaced
    by predictions. ``df_in`` is not modified.
  Raises:
    ValueError: if every ``Age`` is missing, so there is nothing to train on.
  """
  missing = df_in['Age'].isna()
  if not missing.any():
    # Nothing to impute; predicting on zero rows would make sklearn raise.
    return df_in.sort_index()
  if missing.all():
    raise ValueError("Cannot impute 'Age': no rows with a known age to train on")
  # Imported after the guards so the trivial paths do not require sklearn.
  from sklearn.ensemble import GradientBoostingRegressor
  features = df_in.drop(['Age'],axis=1)
  reg = GradientBoostingRegressor(random_state=0)
  reg.fit(features[~missing],df_in.loc[~missing,'Age'])
  df_out = df_in.copy()
  # Mask assignment on a copy avoids writing into a slice of df_in.
  df_out.loc[missing,'Age'] = reg.predict(features[missing])
  return df_out.sort_index()

In [7]:
'''
Function: impute_linear_age
Purpose: Impute missing values using Linear Regression to predict them
Parameter: 
  df: Input DataFrame
Return:
  DataFrame after imputing missing values
#'''

'\nFunction: impute_linear_age\nPurpose: Impute missing values using Linear Regression to predict them\nParameter: \n  df: Input DataFrame\nReturn:\n  DataFrame after imputing missing values\n#'

In [8]:
def impute_linear_age(df):
  """Predict missing ``Age`` values with an ordinary linear regression.

  The rows with a known ``Age`` train the model on all other columns of
  ``df``; the fitted model then fills the rows where ``Age`` is missing.
  The other columns are handed to the regressor as-is, so the caller is
  expected to have encoded and imputed them beforehand.

  Parameters:
    df: DataFrame containing an ``Age`` column.
  Returns:
    A new DataFrame, sorted by index, with missing ``Age`` values replaced
    by predictions. ``df`` is not modified.
  Raises:
    ValueError: if every ``Age`` is missing, so there is nothing to train on.
  """
  missing = df['Age'].isna()
  if not missing.any():
    # Nothing to impute; predicting on zero rows would make sklearn raise.
    return df.sort_index()
  if missing.all():
    raise ValueError("Cannot impute 'Age': no rows with a known age to train on")
  # Imported after the guards so the trivial paths do not require sklearn.
  from sklearn.linear_model import LinearRegression
  features = df.drop(['Age'],axis=1)
  reg = LinearRegression()
  reg.fit(features[~missing],df.loc[~missing,'Age'])
  df_out = df.copy()
  # Mask assignment on a copy avoids writing into a slice of df.
  df_out.loc[missing,'Age'] = reg.predict(features[missing])
  return df_out.sort_index()

In [9]:
'''
Function: LE_columns
Purpose: Label Encoder columns with categorical data
Parameter: 
  df_in: Input DataFrame
Return:
  DataFrame after encoder
#'''

'\nFunction: LE_columns\nPurpose: Label Encoder columns with categorical data\nParameter: \n  df_in: Input DataFrame\nReturn:\n  DataFrame after encoder\n#'

In [10]:
def LE_columns(df):
  """Label-encode the non-numeric categorical columns of ``df`` in place.

  The columns considered are 'Name', 'Sex', 'Cabin' and 'Embarked'. Each one
  gets its own freshly fitted LabelEncoder, so the integer codes are only
  meaningful within this one DataFrame.

  Parameters:
    df: DataFrame containing the four columns above; it is modified in place.
  Returns:
    The same ``df`` object, to allow chaining.
  """
  columns = ['Name','Sex','Cabin','Embarked']
  for col in columns:
    # Inspect the column dtype (not the Series object itself) so columns
    # that are already numeric are left untouched.
    if not pd.api.types.is_numeric_dtype(df[col]):
      df[col] = LabelEncoder().fit_transform(df[col])
  return df

In [11]:
'''
Function: balancing_data
Purpose: Handle imbalanced data
Parameter: 
  df_in: Input DataFrame
Return:
  DataFrame after handling imbalanced data
#'''

'\nFunction: balancing_data\nPurpose: Handle imbalanced data\nParameter: \n  df_in: Input DataFrame\nReturn:\n  DataFrame after handling imbalanced data\n#'

In [12]:
def balancing_data(X_train,Y_train):
  """Oversample the training set with SMOTE and return the balanced data.

  Parameters:
    X_train: training features.
    Y_train: training labels; flattened to 1-D before resampling.
  Returns:
    Tuple ``(X_resampled, Y_resampled)``. The inputs are not modified, so
    callers must use the returned values to benefit from the balancing.
  """
  from imblearn.over_sampling import SMOTE
  sm = SMOTE(random_state=2)
  # fit_resample is the supported name; the fit_sample alias was removed
  # from later imbalanced-learn releases.
  return sm.fit_resample(X_train, np.ravel(Y_train))

In [13]:
'''
Function: RFCaccuracy
Purpose: Find Random Forest model's f1 score
Parameter: 
  df_in: Input DataFrame
Return:
  F1 score
#'''

"\nFunction: RFCaccuracy\nPurpose: Find Random Forest model's f1 score\nParameter: \n  df_in: Input DataFrame\nReturn:\n  F1 score\n#"

In [14]:
def RFCaccuracy(X_train,X_test,Y_train,Y_test = None):
  """Train a random forest and either score it or return its predictions.

  Parameters:
    X_train: training features.
    X_test: features to predict.
    Y_train: training labels.
    Y_test: true labels for ``X_test``. When omitted (None or empty) no
      scoring is possible and the raw predictions are returned instead.
  Returns:
    The F1 score on ``X_test`` when ``Y_test`` is given (train/test scores
    and the fit+predict time are also printed); otherwise the predictions.
  """
  import time
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.metrics import f1_score
  start = time.time()
  clf = RandomForestClassifier()
  clf.fit(X_train,Y_train)
  Y_predict = clf.predict(X_test)
  end = time.time()
  # len() works for lists, arrays and Series alike, whereas comparing a
  # Series/array with [] raises or yields an ambiguous truth value.
  if Y_test is None or len(Y_test) == 0:
    return Y_predict
  score = f1_score(Y_test,Y_predict)
  print("Train set f1 score for Random Forest Classifier: ", f1_score(Y_train,clf.predict(X_train)))
  print("Test set f1 score for Random Forest Classifier: ", score)
  print("Time estimated: ",end-start)
  return score

In [15]:
'''
Function: LRaccuracy
Purpose: Find Logistic Regression model's f1 score
Parameter: 
  df_in: Input DataFrame
Return:
  F1 score
#'''

"\nFunction: LRaccuracy\nPurpose: Find Logistic Regression model's f1 score\nParameter: \n  df_in: Input DataFrame\nReturn:\n  F1 score\n#"

In [16]:
def LRaccuracy(X_train,X_test,Y_train,Y_test):
  """Fit a logistic regression and report its F1 score on the test split.

  Parameters:
    X_train: training features; its row count is used as ``max_iter``.
    X_test: test features.
    Y_train: training labels.
    Y_test: true labels for ``X_test``.
  Returns:
    F1 score of the predictions on ``X_test``. The train and test scores
    and the elapsed time (fit, test prediction and test scoring) are printed.
  """
  from time import time as now
  from sklearn.metrics import f1_score
  from sklearn.linear_model import LogisticRegression
  started_at = now()
  # The iteration cap is tied to the number of training rows.
  model = LogisticRegression(max_iter = X_train.shape[0])
  fitted = model.fit(X_train,Y_train)
  test_f1 = f1_score(Y_test,fitted.predict(X_test))
  elapsed = now() - started_at
  train_f1 = f1_score(Y_train,fitted.predict(X_train))
  print("Train set f1 score for Logistic Regression: ", train_f1)
  print("Test set f1 score for Logistic Regression: ", test_f1)
  print("Time estimated: ",elapsed)
  return test_f1

In [17]:
'''
Function: SVCaccuracy
Purpose: Find Support Vector Machine model's f1 score
Parameter: 
  df_in: Input DataFrame
Return:
  F1 score
#'''

"\nFunction: SVCaccuracy\nPurpose: Find Support Vector Machine model's f1 score\nParameter: \n  df_in: Input DataFrame\nReturn:\n  F1 score\n#"

In [18]:
def SVCaccuracy(X_train,X_test,Y_train,Y_test):
  """Fit a default support vector classifier and report its test F1 score.

  Parameters:
    X_train: training features.
    X_test: test features.
    Y_train: training labels.
    Y_test: true labels for ``X_test``.
  Returns:
    F1 score of the predictions on ``X_test``. The train and test scores
    and the elapsed time (fit, test prediction and test scoring) are printed.
  """
  from time import time as now
  from sklearn.metrics import f1_score
  from sklearn.svm import SVC
  t0 = now()
  classifier = SVC()
  classifier.fit(X_train,Y_train)
  predicted = classifier.predict(X_test)
  test_f1 = f1_score(Y_test,predicted)
  t1 = now()
  train_f1 = f1_score(Y_train,classifier.predict(X_train))
  print("Train set f1 score for Support Vector Classification: ", train_f1)
  print("Test set f1 score for Support Vector Classification: ", test_f1)
  print("Time estimated: ",t1-t0)
  return test_f1

In [19]:
'''
Function: DTCaccuracy
Purpose: Find Decision Tree model's f1 score
Parameter: 
  df_in: Input DataFrame
Return:
  F1 score
#'''

"\nFunction: DTCaccuracy\nPurpose: Find Decision Tree model's f1 score\nParameter: \n  df_in: Input DataFrame\nReturn:\n  F1 score\n#"

In [20]:
def DTCaccuracy(X_train, X_test,Y_train,Y_test = None):
  """Train a decision tree and either score it or return its predictions.

  Parameters:
    X_train: training features.
    X_test: features to predict.
    Y_train: training labels.
    Y_test: true labels for ``X_test``. When omitted (None or empty) no
      scoring is possible and the raw predictions are returned instead,
      matching the behaviour of RFCaccuracy.
  Returns:
    The F1 score on ``X_test`` when ``Y_test`` is given (train/test scores
    and the elapsed time are also printed); otherwise the predictions.
  """
  import time
  from sklearn import tree
  from sklearn.metrics import f1_score
  start = time.time()
  clf = tree.DecisionTreeClassifier()
  clf.fit(X_train,Y_train)
  Y_predict = clf.predict(X_test)
  # Without labels there is nothing to score against; scoring the empty
  # default against the predictions would fail on the length mismatch.
  if Y_test is None or len(Y_test) == 0:
    return Y_predict
  score = f1_score(Y_test,Y_predict)
  end = time.time()
  print("Train set f1 score for Decision Tree Classification: ", f1_score(Y_train,clf.predict(X_train)))
  print("Test set f1 score for Decision Tree Classification: ", score)
  print("Time estimated: ",end-start)
  return score

In [21]:
'''
Purpose: Replace null values with the most frequent value
Parameters: 
  df_in: Dataframe input
  list_columns: List of columns in which to detect and replace missing values
Returns:
  Dataframe after replacing null values with the most frequent value
#'''

'\nPurpose: Replace null values with the most frequent value\nParameters: \n  df_in: Dataframe input\n  list_columns: List of columns in which to detect and replace missing values\nReturns:\n  Dataframe after replacing null values with the most frequent value\n#'

In [22]:
def impute_mode_value(df_in, list_columns):
  """Fill nulls in the given columns with each column's most frequent value.

  Processing is best-effort: a column that cannot be imputed (not present,
  or containing no non-null value to take a mode from) is reported with a
  printed message and left unchanged, and the remaining columns are still
  processed.

  Parameters:
    df_in: input DataFrame; it is copied, never modified.
    list_columns: iterable of column names to impute.
  Returns:
    A copy of ``df_in`` with the nulls of the listed columns filled.
  """
  df_out = df_in.copy()
  for column in list_columns:
    try:
      modes = df_out[column].mode()
      if modes.empty:
        raise ValueError("column has no non-null values")
      # Assign the filled column back instead of calling fillna(inplace=True)
      # on a column selection, which may act on a temporary copy and leave
      # df_out untouched.
      df_out[column] = df_out[column].fillna(modes.iloc[0])
    except Exception as err:
      print("Some thing error with column {0}: {1}".format(column, err))
  return df_out

In [23]:
'''
Function: feature_engine
Purpose: Handle feature(Create feature, Delete feature, Label encoding, Handle 
         missing data)
Parameter: 
  df_in: Input DataFrame
Return:
  DataFrame after handle feature
#'''

'\nFunction: feature_engine\nPurpose: Handle feature(Create feature, Delete feature, Label encoding, Handle \n         missing data)\nParameter: \n  df_in: Input DataFrame\nReturn:\n  DataFrame after handle feature\n#'

In [24]:
def feature_engine(df_in):
  """Turn a raw Titanic frame into a fully numeric, fully imputed frame.

  Steps, in order:
    1. drop 'PassengerId' and 'Ticket';
    2. add 'FamilySize' (SibSp + Parch) and 'FarePerPerson'
       (Fare / (FamilySize + 1)), then drop 'Fare';
    3. reduce 'Cabin' to its first character (' ' when the cabin is missing);
    4. reduce 'Name' to the title between the comma and the first period;
    5. mode-impute every column except 'Age';
    6. label-encode the categorical columns;
    7. predict the missing 'Age' values with gradient boosting.

  Parameters:
    df_in: raw DataFrame with the columns named above; it is not modified.
  Returns:
    The engineered DataFrame.

  NOTE(review): every column that survives step 1 takes part in steps 5 and
  7, including a label column such as 'Survived' when the caller has not
  removed it yet. The label encoders are also fitted per call, so codes from
  two separate calls (train vs. test) are not guaranteed to agree.
  """
  df_out = df_in.drop(['PassengerId','Ticket'],axis = 1)
  df_out['FamilySize'] = df_out['SibSp']+df_out['Parch']
  df_out['FarePerPerson'] = df_out['Fare']/(df_out['FamilySize']+1)
  df_out = df_out.drop(['Fare'],axis = 1)
  # Keep only the first character of the cabin code; missing cabins become ' '.
  df_out['Cabin'] = df_out['Cabin'].fillna(' ').str[0]
  # "Kelly, Mr. James" -> "Mr". The period is matched literally so multi-word
  # titles are captured whole, and a name without a title yields NaN (filled
  # by the mode imputation below) instead of raising.
  df_out['Name'] = df_out['Name'].str.extract(r',\s*([^.]+)\.', expand=False).str.strip()
  df_out = impute_mode_value(df_out, df_out.columns.drop("Age"))
  LE_columns(df_out)
  # Alternatives for the Age step: impute_mode_age, impute_linear_age.
  df_out = impute_gbr_age(df_out)
  return df_out

In [25]:
def main(path = '/content/drive/My Drive/Datasets/Py4DS_Lab5/titanic_train.csv',
         path_test = '/content/drive/My Drive/Datasets/Py4DS_Lab5/titanic_test.csv',
         path_predict = '/content/drive/My Drive/Datasets/Py4DS_Lab5/titanic_predict.csv'):
  """Train on the training CSV, predict the test CSV and save the result.

  Parameters:
    path: CSV with the training data, including the 'Survived' column.
    path_test: CSV with the rows to predict.
    path_predict: destination CSV; it receives the raw test rows plus a
      'Survived' column holding the predictions.
  The defaults are the original Google Drive locations, so ``main()`` keeps
  working unchanged. The prediction frame is also displayed.
  """
  df_raw = pd.read_csv(path)
  df = feature_engine(df_raw)
  df_test_raw = pd.read_csv(path_test)
  df_test = feature_engine(df_test_raw)
  X_train = df.drop(['Survived'],axis = 1)
  Y_train = df['Survived']
  X_test = df_test
  # Called without Y_test, RFCaccuracy returns the predictions themselves.
  Y_pred = RFCaccuracy(X_train,X_test,Y_train)
  df_test_raw['Survived'] = Y_pred
  df_test_raw.to_csv(path_predict,index = False)
  display(df_test_raw)

In [26]:
# Run the pipeline: train on the training CSV, predict the test CSV, then
# save and display the predictions.
main()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [27]:
'''
# Test code
a = []
for i in range(100):
  a.append(main())
a = np.array(a)
print(np.mean(a))
print(a.min())
print(a.max())
#'''

'\n# Test code\na = []\nfor i in range(100):\n  a.append(main())\na = np.array(a)\nprint(np.mean(a))\nprint(a.min())\nprint(a.max())\n#'