In [525]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

Folosesc intervalul intercuartilic (IQR) pentru a identifica și elimina outlier-ele. Outlier-ele sunt valorile mai mici decât Q1 - 1.5·IQR sau mai mari decât Q3 + 1.5·IQR, unde Q1 și Q3 sunt prima și a treia cuartilă (percentilele 25 și 75), iar IQR este diferența dintre Q3 și Q1. Valorile aflate în afara intervalului [Q1 - 1.5·IQR, Q3 + 1.5·IQR] sunt eliminate.

In [75]:
def iqr(df, columns=('Age', 'Fare'), k=1.5):
  """Drop rows whose value in any of `columns` is an IQR outlier.

  For every column the quartiles Q1 and Q3 are computed on the *input*
  frame, and a row is kept only when its value lies inside
  [Q1 - k*IQR, Q3 + k*IQR] (bounds inclusive), with IQR = Q3 - Q1.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the numeric columns named in `columns`.
  columns : iterable of str, default ('Age', 'Fare')
      Columns to filter on.
  k : float, default 1.5
      Multiplier applied to the IQR to obtain the outlier fences.

  Returns
  -------
  pandas.DataFrame
      A new frame with the outlier rows removed and a fresh 0..n-1 index.
      The input frame is not modified.

  Notes
  -----
  Missing values are ignored when computing the quartiles. Rows whose
  value is missing in a filtered column are dropped, because a missing
  value cannot be shown to lie inside the fences.
  """
  keep = np.ones(len(df), dtype=bool)

  for column in columns:
    values = df[column]

    # Series.quantile skips NaN; np.percentile would return NaN and the
    # comparison below would then silently discard every row.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = k * (q3 - q1)

    # `between` is inclusive on both ends and evaluates to False for NaN.
    keep &= values.between(q1 - margin, q3 + margin).to_numpy()

  # Non-inplace reset: returns an independent frame instead of mutating a slice.
  return df[keep].reset_index(drop=True)

Calculez Z-score pentru fiecare observație și elimin valorile al căror Z-score absolut este mai mare decât un anumit prag. Z-score reprezintă distanța unei valori față de media setului de date, măsurată în deviații standard. În cazul lui Age, valorile cu un Z-score absolut mai mare de 1.9 sunt considerate outlier-e. Pentru Fare se elimină valorile cu un Z-score absolut mai mare de 0.7.

In [455]:
def z_score(df, age_threshold=1.9, fare_threshold=0.7):
  """Drop rows whose 'Age' or 'Fare' is a z-score outlier.

  The filter is applied sequentially: first rows with |z(Age)| above
  `age_threshold` are removed, then the Fare z-scores are computed on the
  remaining rows and rows with |z(Fare)| above `fare_threshold` are removed.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing numeric 'Age' and 'Fare' columns.
  age_threshold : float, default 1.9
      Largest absolute Age z-score that is kept (chosen to approximate
      the IQR filter).
  fare_threshold : float, default 0.7
      Largest absolute Fare z-score that is kept (chosen to approximate
      the IQR filter).

  Returns
  -------
  pandas.DataFrame
      A new frame without the outlier rows and with a fresh 0..n-1 index.
      The input frame is not modified.

  Notes
  -----
  Missing values are ignored when computing mean and standard deviation
  (population std, ddof=0). Rows whose Age or Fare is missing are dropped,
  because their z-score is undefined.
  """
  filtered = df

  for column, threshold in (('Age', age_threshold), ('Fare', fare_threshold)):
    # nan_policy='omit' keeps one missing value from turning every score
    # into NaN (the default 'propagate' would empty the whole frame).
    scores = stats.zscore(filtered[column].to_numpy(dtype=float), nan_policy='omit')

    # NaN scores compare False and are therefore filtered out.
    filtered = filtered[np.abs(scores) <= threshold].reset_index(drop=True)

  return filtered

Realizare grafice relevante:

In [77]:
def _plot_survival_hist(frame, column, title):
  """Overlay survivor / non-survivor histograms of `column` on the current axes."""
  for flag, color, label in ((1, 'yellow', 'Survived'), (0, 'blue', 'Not Survived')):
    plt.hist(frame.loc[frame['Survived'] == flag, column],
             bins=30, alpha=0.5, color=color, label=label)
  plt.xlabel(column)
  plt.title(title)
  plt.legend()


def data_analysis(df):
  """Plot exploratory charts relating survival to age, sex, fare and class.

  Figures produced:
    1. Age histograms split by survival, one panel for men and one for women.
    2. Fare histogram split by survival.
    3. Grid of age histograms, one cell per (Pclass, Survived) pair.
    4. Only when a 'Relatives' column exists: mean survival per number of
       relatives and the distribution of the number of relatives.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with at least 'Age', 'Sex', 'Survived', 'Fare' and 'Pclass'.
      Rows without an age are excluded from *all* plots, including the
      fare and relatives ones. The caller's frame is not modified.
  """
  # List form of `subset` works on every pandas version (a bare string needs >= 1.5).
  df = df.dropna(subset=['Age'])

  # Age vs. survival, one panel per sex.
  plt.figure(figsize=(10, 3))
  panels = (('male', 'Age-Survived Male'), ('female', 'Age-Survived Female'))
  for position, (sex, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    # The helper draws the legend on every panel; previously only the
    # female panel had one.
    _plot_survival_hist(df[df['Sex'] == sex], 'Age', title)

  # Ticket price vs. survival.
  plt.figure(figsize=(5, 3))
  _plot_survival_hist(df, 'Fare', 'Fare-Survived')

  # Age distribution for every class / survival combination.
  grid = sns.FacetGrid(df, col='Survived', row='Pclass', height=2, aspect=1.5, palette='pastel')
  grid.map(sns.histplot, 'Age', kde=False, bins=30)

  # Number of relatives aboard vs. survival (column is created later in the pipeline).
  if 'Relatives' in df.columns:
    plt.figure(figsize=(10, 3))
    plt.subplot(1, 2, 1)
    sns.barplot(x='Relatives', y='Survived', data=df)

    plt.subplot(1, 2, 2)
    plt.hist(df['Relatives'], bins=30, alpha=1)
    plt.xlabel('Relatives')

  # Render unconditionally, so the figures also appear when 'Relatives' is absent.
  plt.show()

Prelucrare date si antrenare cu RandomForest:

In [535]:
#############################################
# Shared preprocessing configuration/helpers #
#############################################

# Honorifics that are spelling variants of a more common title.
TITLE_ALIASES = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
# Titles seen fewer than this many times in the training data become 'Rare'.
MIN_TITLE_COUNT = 10
# Text columns that are replaced by integer codes.
CATEGORICAL_COLUMNS = ['Sex', 'Embarked', 'Name', 'Deck']
# Right-closed bin edges: value <= first edge -> 0, next interval -> 1, ...
# 29.699 isolates the imputed mean age in its own bin (29.699, 30].
AGE_BINS = [-np.inf, 19, 25, 29.699, 30, 40, np.inf]
FARE_BINS = [-np.inf, 7.775, 8.3, 13.5, 26, 46.9, np.inf]


def _engineer_columns(raw):
  """Derive Deck, title and Relatives; drop PassengerId, Ticket and Cabin.

  Needs no fitted state, so it is applied identically to train and test data.
  Returns a new frame; `raw` is left untouched.
  """
  frame = raw.drop(columns=['PassengerId', 'Ticket'])

  # Deck = leading letter(s) of the cabin; a missing cabin becomes deck 'N'.
  frame['Deck'] = frame['Cabin'].fillna('N0').str.extract('([A-Za-z]+)', expand=False)
  frame = frame.drop(columns=['Cabin'])

  # Reduce the full name to its honorific ("Braund, Mr. Owen" -> "Mr").
  # Raw string: '\.' is an invalid escape sequence in a normal literal.
  titles = frame['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
  frame['Name'] = titles.replace(TITLE_ALIASES)

  # Number of relatives aboard.
  frame['Relatives'] = frame['SibSp'] + frame['Parch']
  return frame


def _impute_and_group_titles(frame, common_titles, fill_values):
  """Collapse uncommon titles to 'Rare' and fill missing values.

  `common_titles` and `fill_values` must come from the training data so
  that train and test rows are treated the same way.
  """
  out = frame.copy()
  out['Name'] = out['Name'].where(out['Name'].isin(common_titles), 'Rare')
  for column, value in fill_values.items():
    out[column] = out[column].fillna(value)
  return out


def _encode_and_discretize(frame, category_codes):
  """Integer-encode the categorical columns, bin Age/Fare, add Age_Class.

  `category_codes` maps column -> {label: code}; labels that are not in the
  mapping (categories never seen during training) are encoded as -1.
  """
  out = frame.copy()
  for column, codes in category_codes.items():
    out[column] = out[column].map(codes).fillna(-1).astype(int)

  out['Age'] = pd.cut(out['Age'], bins=AGE_BINS, labels=False).astype(int)
  out['Fare'] = pd.cut(out['Fare'], bins=FARE_BINS, labels=False).astype(int)

  # Interaction feature between (binned) age and passenger class.
  out['Age_Class'] = out['Age'] * out['Pclass']
  return out


#############################################
 #Preparing the training data set#
#############################################

# Read train.csv.
df_train = pd.read_csv('train.csv')

# Uncomment to plot the raw data before any preprocessing:
# data_analysis(df_train)

# Drop PassengerId/Ticket/Cabin, derive Deck, title (stored in 'Name') and Relatives.
df_train = _engineer_columns(df_train)

# State learned from the training data only; reused unchanged for test.csv.
title_counts = df_train['Name'].value_counts()
common_titles = title_counts[title_counts >= MIN_TITLE_COUNT].index
fill_values = {
    'Age': df_train['Age'].mean(),             # mean age
    'Fare': df_train['Fare'].mean(),           # mean ticket price
    'Embarked': df_train['Embarked'].mode()[0],  # most frequent port
}
df_train = _impute_and_group_titles(df_train, common_titles, fill_values)

# Remove Age / Fare outliers (training rows only).
df_train = z_score(df_train)

# Codes follow the sorted label order of the filtered training data,
# i.e. the same numbering a LabelEncoder fitted on it would produce.
category_codes = {
    column: {label: code
             for code, label in enumerate(sorted(df_train[column].dropna().unique()))}
    for column in CATEGORICAL_COLUMNS
}
df_train = _encode_and_discretize(df_train, category_codes)

# Write the filtered data (before normalisation) to "filtered_train.csv".
df_train.to_csv('filtered_train.csv')

# Training inputs, normalised with StandardScaler.
features = df_train.drop(columns=['Survived'])
scaler = StandardScaler()
# NOTE(review): the scaler is fitted before the hold-out split below. That is
# harmless for a tree-based model, but fit it on X_train only if the
# estimator is ever replaced by a scale-sensitive one.
X = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Training target.
y = df_train['Survived']

#############################################
 #Hold-out evaluation#
                  #or#
    #Prediction for the Kaggle competition#
#############################################

# Anything other than "test" selects prediction mode.
scop = input("Scop (test sau predictie): ").strip().lower()
if scop == "test":
  # Hold out the last 20% of the rows (no shuffling -> deterministic split).
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                      shuffle=False)
else:
  X_train = X
  y_train = y

  #############################################
  #Preparing the test data set#
  #############################################

  # Read test.csv and remember the ids for the submission file.
  df_test = pd.read_csv('test.csv')
  passenger_ids = df_test['PassengerId']

  # Same pipeline as for training, driven by the state learned from train.csv.
  # Previously the test set got its own encoders, scaler and imputation means,
  # so identical values were mapped to different numbers than in training.
  df_test = _engineer_columns(df_test)
  df_test = _impute_and_group_titles(df_test, common_titles, fill_values)
  df_test = _encode_and_discretize(df_test, category_codes)

  # Reuse the scaler fitted on the training data; enforce the training column order.
  df_test = pd.DataFrame(scaler.transform(df_test[X.columns]), columns=X.columns)

  X_test = df_test

#############################################
  #Training and prediction with Random Forest#
#############################################
model = RandomForestClassifier(random_state = 3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Feature importances (uncomment to inspect):
# importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(model.feature_importances_,3)})
# importances = importances.sort_values('importance',ascending=False).set_index('feature')
# importances.head(15)
# importances.plot.bar()

if scop == "test":
  accuracy = accuracy_score(y_test, y_pred)
  print(f"Acuratete: {accuracy}\n")
  report = classification_report(y_test, y_pred)
  print(f"Raport:\n{report}")
else:
  df_out = pd.DataFrame({
      'PassengerId': passenger_ids,
      'Survived': y_pred
  })
  df_out.to_csv('prediction.csv', index=False)

Scop (test sau predictie): predictie
