In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Load the training set from train.csv (path is relative to the working directory)
data=pd.read_csv('train.csv')
# Preview the first rows to sanity-check the load
data.head()

In [None]:
# Load the test set from test.csv (path is relative to the working directory)
Test_data=pd.read_csv('test.csv')
# Preview the first rows to sanity-check the load
Test_data.head()

In [None]:
# Number of distinct values in each column of the training set
data.nunique()

In [None]:
# Number of distinct values in each column of the test set
Test_data.nunique()

In [None]:
# Summary statistics of the training set
data.describe()

In [None]:
# Column dtypes and non-null counts of the training set
data.info()

In [None]:
# Column dtypes and non-null counts of the test set
Test_data.info()

In [None]:
# Count the missing values per column in both frames and print the two reports
train_missing = data.isna().sum()
test_missing = Test_data.isna().sum()
print('data:', train_missing, '\n')
print('Test_data:', test_missing)

In [None]:
# Column dtypes of the training set, inspected before the boolean conversion below
data.dtypes

In [None]:
# Column dtypes of the test set, inspected before the boolean conversion below
Test_data.dtypes

In [None]:
#converting CryoSleep and VIP columns to boolean dtype on Train data
# A bare astype(bool) maps NaN to True, which silently records every missing
# value as a positive. Remember where the gaps are and overwrite them with the
# column's most frequent value (the same strategy this notebook applies to
# HomePlanet and Destination).
for col in ['CryoSleep', 'VIP']:
    missing = data[col].isna()
    most_common = bool(data[col].mode().iloc[0])  # mode() ignores NaN
    data[col] = data[col].astype(bool).mask(missing, most_common)
data.dtypes

In [None]:
#converting CryoSleep and VIP columns to boolean dtype on Test data
# A bare astype(bool) maps NaN to True, which silently records every missing
# value as a positive. Fill the gaps with the most frequent value of the
# corresponding training column so no statistic is derived from the test set.
for col in ['CryoSleep', 'VIP']:
    missing = Test_data[col].isna()
    most_common = bool(data[col].mode().iloc[0])  # mode() ignores NaN
    Test_data[col] = Test_data[col].astype(bool).mask(missing, most_common)
Test_data.dtypes

In [None]:
# Keep only the categorical/binary columns by dropping everything else by name
non_categorical = ['PassengerId','Cabin','Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Name']
cat_cols = data.drop(columns=non_categorical)
cat_cols

In [None]:
# Print the distinct values found in each selected column
for col_name in cat_cols.columns:
    print(col_name, ': ', cat_cols[col_name].unique())

In [None]:
# Print how often each distinct value occurs, one column at a time
for col_name in cat_cols.columns:
    print()
    print(f'{col_name}:')
    print(cat_cols[col_name].value_counts())

In [None]:
#Selecting only numerical/continuous columns
# Bind num_cols only: a chained `num_cols=cat_cols=...` would also rebind
# cat_cols to this numeric frame and lose the categorical selection.
num_cols=data.select_dtypes(include=['float64','int64'])
num_cols

In [None]:
# Print the summary statistics of every numerical column
for col_name, series in num_cols.items():
    print()
    print(col_name)
    print(series.describe())

In [None]:
#data correlation
# Restrict to numeric/boolean columns first: the frame still contains string
# columns here, and DataFrame.corr() raises on those in pandas >= 2.0 instead of
# silently skipping them.
corr=data.select_dtypes(include=['number','bool']).corr()
sns.heatmap(corr,cmap='hot')

In [None]:
# List the training-set column names
data.columns

In [None]:
# Missing-value count per training column, checked before filling
data.isnull().sum()

# Filling Missing Data 

In [None]:
# Replace missing HomePlanet / Destination entries in the training set with the
# most frequent value of the respective column
for col in ['HomePlanet', 'Destination']:
    most_frequent = data[col].mode().iloc[0]
    data[col] = data[col].fillna(most_frequent)

In [None]:
# Re-check the missing-value counts after filling HomePlanet and Destination
data.isnull().sum()

In [None]:
# Confirm dtypes and non-null counts after the fill
data.info()

In [None]:
# Drop the PassengerId and Name columns from the training set
data = data.drop(columns=['PassengerId', 'Name'])

In [None]:
# Drop the PassengerId and Name columns from the test set
Test_data = Test_data.drop(columns=['PassengerId', 'Name'])

# Experimenting

In [None]:
# Logistic Regression: encode the categorical columns, drop incomplete rows, then fit and evaluate the model

In [None]:
#try label encoding for Train Data
label_encoder = preprocessing.LabelEncoder()
data['HomePlanet']= label_encoder.fit_transform(data['HomePlanet']) 
print(data['HomePlanet'].value_counts(),'\n')
data['Destination']= label_encoder.fit_transform(data['Destination']) 
print(data['Destination'].value_counts())

In [None]:
#try label encoding for Test Data
Test_data['HomePlanet']= label_encoder.fit_transform(Test_data['HomePlanet']) 
print(Test_data['HomePlanet'].value_counts(),'\n')
Test_data['Destination']= label_encoder.fit_transform(Test_data['Destination']) 
print(Test_data['Destination'].value_counts())

In [None]:
# Drop the Cabin column, then keep only the training rows without any missing value
data = data.drop(columns=['Cabin'])
rem_mv = data.dropna(axis=0, how='any')
print('Train Data')
print('data:', len(data), 'rem_mv:', len(rem_mv))

In [None]:
# Drop the Cabin column, then keep only the test rows without any missing value
Test_data = Test_data.drop(columns=['Cabin'])
Fin_Test_data = Test_data.dropna(axis=0, how='any')
print('Test Data')
print('Test_data:', len(Test_data), 'Fin_Test_data:', len(Fin_Test_data))

In [None]:
# Feature matrix: every row, first ten columns of the cleaned training frame
X = rem_mv.iloc[:, :10]
X

In [None]:
# Target vector: the last column of the cleaned training frame
y = rem_mv.iloc[:, -1]
y

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Fit a logistic-regression classifier with default settings on the training split
logreg = LogisticRegression().fit(X_train, y_train)
logreg

In [None]:
# Predict labels for the held-out split
y_pred=logreg.predict(X_test)

In [None]:
# Display the predicted labels for the held-out split
y_pred

In [None]:
# Confusion matrix of actual vs. predicted labels on the held-out split
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Confusion-matrix heatmap, drawn through the explicit Axes interface
class_names = [0, 1]  # class labels
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_names)
# Annotated heatmap of the counts on the axes created above
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, fmt='g', cmap="YlGnBu", ax=ax)
ax.xaxis.set_label_position("top")
fig.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

In [None]:
# Report the classification scores on the held-out split
scorers = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1", metrics.f1_score),
)
for label, scorer in scorers:
    print(label, scorer(y_test, y_pred))

In [None]:
# ROC curve and AUC from the probabilities in the second predict_proba column
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
roc_label = "data 1, auc=" + str(auc)
plt.plot(fpr, tpr, label=roc_label)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Apply the fitted model to the cleaned test.csv rows
# NOTE(review): Fin_Test_data must hold the same columns, in the same order, as X — confirm
y_pred_Test=logreg.predict(Fin_Test_data)

In [None]:
# Display the predictions for the test.csv rows
y_pred_Test

In [None]:
# Number of predictions made on the held-out split
# NOTE(review): the preceding cells work with y_pred_Test; confirm y_pred is the intended array here
len(y_pred)