In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.plyplot as plt
%matplotlib inline
import math
import os #for importing csv file from any location

titanic_data=pd.read_csv("/home/ashleymuoki/Downloads/titanic.csv")
titanic_data.head(10)

In [None]:
# Report how many passengers (rows) the raw dataset contains
passenger_count = len(titanic_data.index)
print(f"The number of passengers in original data:{passenger_count}")


# Analyze the data 

In [None]:
# Count passengers for each value of the "Survived" column
sns.countplot(data=titanic_data, x="Survived")

In [None]:
# Survival counts split by gender.
# Fixed column names: the frame uses "Survived" (capitalised, as in the other
# count plots) and "Sex" (the column later passed to pd.get_dummies), not
# "survived" / "Sue".
sns.countplot(x="Survived", hue="Sex", data=titanic_data)

In [None]:
# Survival counts broken down by the passengers' travelling class
sns.countplot(data=titanic_data, x="Survived", hue="Pclass")


In [None]:
# Histogram of passenger ages, drawn with pandas' default bins and figure size
titanic_data["Age"].plot(kind="hist")


In [None]:
# Histogram of ticket fares.
# `bins` sets how many bars the fare range is split into and `figsize` is the
# (width, height) of the figure.
# Fixed: the keyword is `bins` (not `bin`) and the stray `2=` was a SyntaxError.
# NOTE(review): 20 bins is inferred from the `2=` typo; adjust if another
# value was intended.
titanic_data["Fare"].plot.hist(bins=20, figsize=(10, 5))


In [None]:
#identify columns in the data set: info() prints a summary of the frame's columns
titanic_data.info()

In [None]:
#plot sibling and spouses counts
# NOTE(review): the standard Kaggle Titanic CSV spells this column "SibSp";
# confirm the name against the titanic_data.info() output, since a mismatch
# makes this call fail.
sns.countplot(x="Sibsp", data = titanic_data)

# Data wrangling 

In [None]:
# Build a boolean mask of missing data: True where a value is null, False otherwise
missing_mask = titanic_data.isna()

# Total the null entries found in each column
missing_mask.sum()

In [None]:
# Visualise the missing values as a heatmap; cmap selects the colour map.
# Fixed: `yticklabels==False` was a comparison on an undefined name (NameError);
# the keyword argument `yticklabels=False` is what hides the per-row labels.
sns.heatmap(titanic_data.isnull(), yticklabels=False, cmap="viridis")


In [None]:
# Missing values can be handled by filling in substitute values, or by dropping
# a column that has too many gaps. First compare Age across passenger classes.
sns.boxplot(data=titanic_data, x="Pclass", y="Age")


In [None]:
# Preview the first five rows of the dataset (five is head()'s default)
titanic_data.head()


In [None]:
# Drop the column that has too many missing values to be worth filling in.
# Fixed: "Column name" was a placeholder, not a real column, and raised KeyError.
# NOTE(review): "Cabin" assumes the standard Kaggle Titanic schema, where it is
# the mostly-empty column. Confirm against the isnull().sum() output above.
titanic_data.drop(columns=["Cabin"], inplace=True)


In [None]:
# Preview the first five rows again to confirm the column was dropped
titanic_data.head()

In [None]:
# Remove, in place, every row that still contains at least one missing value
titanic_data.dropna(axis="index", how="any", inplace=True)


In [None]:
# Redraw the missing-value heatmap (no row labels, no colour bar) after cleaning
sns.heatmap(data=titanic_data.isnull(), cbar=False, yticklabels=False)


In [None]:
# Per-column null counts, to check whether the data is clean
titanic_data.isna().sum()

In [None]:
# Encode the string-valued Sex column as dummy (indicator) columns;
# drop_first discards the first of the generated columns.
sex = pd.get_dummies(data=titanic_data["Sex"], drop_first=True)
# Show the first five rows to check that the first column was dropped
sex.head()


In [None]:
# Same encoding for the Embarked column, again discarding the first dummy column
embark = pd.get_dummies(data=titanic_data["Embarked"], drop_first=True)
# Show the first five rows to check that the first column was dropped
embark.head()

In [None]:
# Encode the passenger class as dummy columns, discarding the first one.
# Fixed: without a prefix the dummy columns are labelled with the integer class
# values, which mixes integer and string column labels once they are
# concatenated with the rest of the frame. Recent scikit-learn versions reject
# such a frame when fitting. The prefix keeps every label a string
# ("Pclass_<value>").
pclass = pd.get_dummies(titanic_data['Pclass'], prefix="Pclass", drop_first=True)
#display to check if first column dropped
pclass.head(5)


In [None]:
# Append the dummy columns to the frame, side by side, and preview the result
dummy_frames = [sex, embark, pclass]
titanic_data = pd.concat([titanic_data, *dummy_frames], axis="columns")
titanic_data.head()

In [None]:
# Drop columns that are no longer needed: the originals of the dummy-encoded
# columns plus the identifier / text columns.
# Fixed: the closing bracket of the label list was placed after the keyword
# arguments, which was a SyntaxError.
# NOTE(review): the standard Kaggle CSV names the id column "PassengerId";
# confirm that "Passenger_id" matches this file.
titanic_data.drop(['Sex', 'Embarked', 'Pclass', 'Passenger_id', 'Name', 'Ticket'], axis=1, inplace=True)
titanic_data.head(5)


# Train data 

In [None]:
# The dependent variable is Survived; every remaining column is an independent variable
y = titanic_data["Survived"]
x = titanic_data.drop(columns="Survived")


In [None]:
# Import the helper used to split the data into training and test sets.
# Fixed: `sklearn.crossvalidation` does not exist; train_test_split lives in
# sklearn.model_selection (the old sklearn.cross_validation module was removed).
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
#create the model
from sklearn.linear_model import LogisticRegression

In [None]:
# Instantiate the classifier with its default settings.
# Fixed: `LogsiticRegression` was a misspelling of the imported
# LogisticRegression class and raised NameError.
logmodel = LogisticRegression()

In [None]:
# Train the model on the training split
logmodel.fit(X=x_train, y=y_train)


In [None]:
# Predict the outcome for every row of the held-out test split
predictions = logmodel.predict(X=x_test)

In [None]:
#checking accuracy using classification report
from sklearn.metrics import classification_report


In [None]:
# Per-class precision / recall / F1 summary for the test predictions.
# Fixed: the imported function is `classification_report` (lower-case c);
# `Classification_report` raised NameError.
# The report is returned as a multi-line string, so it is printed to render
# as a table instead of a repr full of "\n" escapes.
print(classification_report(y_test, predictions))

In [None]:
#calculate accuracy using the confusion matrix 
from sklearn.metrics import confusion_matrix


In [None]:
# Confusion matrix of true labels against predicted labels on the test split
confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# Overall accuracy of the test predictions, computed with accuracy_score
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_test, y_pred=predictions)

# SUV predictions

In [None]:
import numpy as np
import matplotlib as plt
import pandas as pd
%matplotlib inline

In [None]:
# Load the SUV predictions CSV from the working directory and preview ten rows
SUV_CSV = "SUV predictions.csv"
dataset = pd.read_csv(SUV_CSV)
dataset.head(n=10)

In [None]:
# Independent variables: the columns at positions 2 and 3.
# Dependent variable: the column at position 4.
feature_positions = [2, 3]
target_position = 4
x = dataset.iloc[:, feature_positions].values
y = dataset.iloc[:, target_position].values


In [None]:
# Split into training and test sets (25% held out, fixed random_state).
# Fixed: train_test_split is provided by sklearn.model_selection (the old
# sklearn.cross_validation module was removed), and the call was missing the
# function name, which made the assignment a SyntaxError.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)


In [None]:
# Scale the inputs: fit the scaler on the training set only, then apply that
# same fitted scaler to both the training and the test set
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)


In [None]:
# Import the model class.
# Fixed: the class is `LogisticRegression` (capital L); `logisticRegression`
# raised ImportError. The stray space in `sklearn .linear_model` is removed too.
from sklearn.linear_model import LogisticRegression

In [None]:
# Build the classifier with a fixed random_state, then train it on the scaled data
classifier = LogisticRegression(random_state=0)
classifier.fit(X=x_train, y=y_train)

In [None]:
# Predict the label for every row of the scaled test set
y_pred = classifier.predict(X=x_test)

In [None]:
# Accuracy on the test set, expressed as a percentage.
# Fixed: the module name was misspelled (`slkearn`) and both lines carried
# stray leading indentation, which is an IndentationError at cell level.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred) * 100
    