In [1]:
import gensim
import math
import nltk
import numpy as np
import operator
import pandas as pd
import pickle
import pydotplus
import random
import re
import seaborn as sns
import string
import sys
import time
import warnings
import zipfile

from collections import Counter

from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.lsimodel import LsiModel

from gensim.utils import simple_preprocess
from gensim.utils import simple_preprocess

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from io import StringIO

from IPython.display import Image

from matplotlib import pyplot as plt
from matplotlib import colors as mcolors
from matplotlib import ticker

from mlxtend.classifier import StackingClassifier

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.stem.wordnet import WordNetLemmatizer

from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim

from scipy import stats
from scipy.cluster import hierarchy as sch

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_moons
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_fscore_support as error_metric
from sklearn.metrics import r2_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from string import punctuation

from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from wordcloud import WordCloud
from wordcloud import STOPWORDS

from xgboost import XGBClassifier
from xgboost import XGBRegressor

# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')
# Print NumPy arrays in full instead of abbreviating long ones.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
def log_transform(value):
    """Return the natural logarithm of ``value``, or 0 when ``value > 0`` is false.

    Zero, negative numbers and NaN all fall into the second case, so the
    function never returns ``-inf`` or ``nan`` for scalar input.
    """
    if value > 0:
        return np.log(value)
    return 0

def sqrt_transform(value):
    """Return the square root of ``value`` as computed by ``np.sqrt``."""
    root = np.sqrt(value)
    return root

def boxcox_transform(array):
    """Box-Cox transform ``array`` and return the result as a new Series.

    ``array`` must expose ``.index`` (e.g. a pandas Series); the returned
    Series carries the same index labels. Exact zeros are replaced by 1e-9
    before the transform; negative values are passed through unchanged.
    The lambda fitted by ``scipy.stats.boxcox`` is discarded.
    """
    original_index = list(array.index)
    # np.where returns a plain ndarray, so the index is re-attached below.
    nonzero_values = np.where(array == 0, 0.000000001, array)
    transformed, _fitted_lambda = stats.boxcox(nonzero_values)
    return pd.Series(data=transformed, index=original_index)

def check_skewness(array):
    """Plot the distribution of ``array`` and print its sample skewness.

    NOTE(review): ``sns.distplot`` is deprecated in recent seaborn releases;
    confirm the installed version still provides it.
    """
    sns.distplot(array)
    skewness = stats.skew(array)
    print(skewness)
    
def show_transformed(array):
    """Show how log, square-root and Box-Cox transforms change the skew of ``array``.

    For each transform, in that order, the skewness of the transformed
    values is printed and their distribution is plotted in its own figure.
    ``array`` must support ``.apply`` and ``.index`` (e.g. a pandas Series).
    """
    # Pair each message template with its transform. The transforms are
    # wrapped in lambdas so each one runs only when its turn comes.
    steps = (
        ("Log skew: {0}", lambda series: series.apply(log_transform)),
        ("Sqrt skew: {0}", lambda series: series.apply(sqrt_transform)),
        ("Boxcox skew: {0}", lambda series: boxcox_transform(series)),
    )
    for message, transform in steps:
        transformed = transform(array)
        print(message.format(stats.skew(transformed)))
        sns.distplot(transformed)
        plt.show()

In [3]:
# Load data.csv from the "processed data" folder (path is relative to the
# working directory); the first CSV column becomes the row index.
data = pd.read_csv("processed data/data.csv", index_col=0)

In [4]:
# Remove the columns that are not used from here on. The result is assigned
# back instead of using inplace=True, and the redundant axis=1 is dropped
# because columns= already names the axis.
data = data.drop(columns=['Name', 'SibSp', 'Parch', 'Ticket',
                          'Fare', 'Cabin', 'Embarked', 'Title',
                          'Title for Age', 'Ticket Prefix',
                          'Deck'])

In [5]:
# Count missing values per column, largest count first.
data.isnull().sum().sort_values(ascending=False)

Survived              418
Title Social            0
Family Size             0
CD2LF Distr             0
CD2LF Pop               0
CDensity                0
Cabin Shared With       0
Fare Individual         0
Ticket Shared With      0
Age                     0
Sex                     0
Pclass                  0
dtype: int64

In [6]:
# Cast Age to int (fractional ages are truncated), then Box-Cox transform it.
# boxcox_transform already returns a fresh Series on the same index.
data['Age'] = boxcox_transform(data['Age'].astype(int))

In [7]:
# Round each individual fare to two decimals, then Box-Cox transform the column.
data['Fare Individual'] = boxcox_transform(
    data['Fare Individual'].apply(lambda fare: round(fare, 2))
)

In [8]:
# Bucket "Cabin Shared With": 1 and 2 keep their own bucket, 3-5 -> 3, 6-7 -> 4.
# NOTE(review): Series.map yields NaN for any value missing from the mapping.
# The float-suffixed dummy names in the later data.head() output
# (e.g. "Cabin Shared With_1.0") suggest that happens here - confirm that
# leaving those rows without a bucket is intended.
cabin_shared_buckets = {1: 1, 2: 2, 3: 3, 4: 3, 5: 3, 6: 4, 7: 4}
data['Cabin Shared With'] = data['Cabin Shared With'].map(cabin_shared_buckets)

In [9]:
# Bucket "Family Size": 1 -> 1, 2-3 -> 2, 4 -> 3, and 5-8 or 11 -> 4.
# Sizes absent from the mapping (e.g. 9 or 10) would become NaN via Series.map.
family_size_buckets = {1: 1, 2: 2, 3: 2, 4: 3, 5: 4, 6: 4, 7: 4, 8: 4, 11: 4}
data['Family Size'] = data['Family Size'].map(family_size_buckets)

# Keep ticket-sharing counts 1-4 as they are and lump every other value into 5.
data['Ticket Shared With'] = data['Ticket Shared With'].apply(
    lambda count: count if count in [1, 2, 3, 4] else 5
)

In [10]:
# One-hot encode the categorical and bucketed columns; every other column is
# kept as is. dtype is pinned to uint8 (the default before pandas 2.0) so the
# indicator columns stay 0/1 integers on pandas >= 2.0 as well, where the
# default became bool.
data = pd.get_dummies(
    data,
    columns=['Pclass', 'Sex', 'Title Social', 'Ticket Shared With',
             'Cabin Shared With', 'Family Size'],
    dtype=np.uint8,
)

In [11]:
# Preview the first rows of the encoded frame.
data.head()

Unnamed: 0_level_0,Age,Survived,Fare Individual,CDensity,CD2LF Pop,CD2LF Distr,Pclass_1,Pclass_2,Pclass_3,Pclass_4,...,Ticket Shared With_4,Ticket Shared With_5,Cabin Shared With_1.0,Cabin Shared With_2.0,Cabin Shared With_3.0,Cabin Shared With_4.0,Family Size_1,Family Size_2,Family Size_3,Family Size_4
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,11.378487,0.0,2.71541,2.373239,2.553812,2.178697,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,17.49822,1.0,6.448082,0.776442,1.058785,2.704327,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,13.00195,1.0,2.878758,2.373239,2.553812,2.178697,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,16.41759,1.0,5.615653,0.776442,1.058785,2.704327,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
5,16.41759,0.0,2.909327,3.1,2.980769,1.953125,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [13]:
# Write the encoded frame, index included, to data_for_training.csv.
# NOTE(review): the null count above reports 418 rows with a missing
# 'Survived' value and nothing in this notebook removes them, so they are
# written to this file too - confirm the consumer expects that.
data.to_csv('processed data/data_for_training.csv')