In [129]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition  import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [130]:
titanic = pd.read_csv("train.csv")

In [131]:
cols_to_drop=["PassengerId","Name","Ticket","Cabin","Fare","Embarked"]

In [132]:
titanic= titanic.drop(cols_to_drop,axis=1)

In [133]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 41.8+ KB


In [134]:
le= LabelEncoder()
titanic.Sex=le.fit_transform(titanic.Sex)

In [135]:
titanic['Age'].fillna(titanic["Age"].mean(),inplace=True)

In [136]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,1,22.0,1,0
1,1,1,0,38.0,1,0
2,1,3,0,26.0,0,0
3,1,1,0,35.0,1,0
4,0,3,1,35.0,0,0


In [205]:
X=titanic.drop(["Survived"],axis=1)
y=titanic.Survived

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [207]:
from sklearn.neighbors import KNeighborsClassifier

In [208]:
model=KNeighborsClassifier(5)

In [209]:
model.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [210]:
model.score(X_test,y_test)

0.7559322033898305

In [211]:
from sklearn.linear_model import LogisticRegression

In [212]:
models=LogisticRegression()

In [213]:
models.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [214]:
models.score(X_test,y_test)

0.8135593220338984

In [215]:
def entropy(col):
    items,counts=np.unique(col.values,return_counts=True)
    size = col.values.shape[0]
    acc=0
    for count in counts:
        pi = count/size
        acc += (pi* np.log2(pi))
        
    return -acc

In [216]:
vals = pd.Series([1,1,1,0,0,0])
print(entropy(vals))

1.0


In [217]:
def info_gain(X,Y,label):
    mean = np.mean(X[label])
    
    
    left_y = Y[X[label]<mean]
    right_y = Y[X[label]>=mean]
    
    s_total = Y.shape[0]
    s_left = left_y.shape[0]
    s_right = right_y.shape[0]
    
    if left_y.shape[0] == 0 or right_y.shape[0] == 0:
        return -10000
    
    return entropy(Y) - ((s_left/s_total)*entropy(left_y) + (s_right/s_total)*entropy(right_y))
    
    
    
    
    

In [218]:
for col in X_train.columns:
    print(col, info_gain(X_train,y_train,col))

Pclass 0.0693612979550764
Sex 0.20332074264151545
Age 0.0016815830685426025
SibSp 0.004524587061436547
Parch 0.014104142987097612


In [219]:
class Node:
    def __init__(self,label=None, value= None,result=None):
        self.label=label
        self.value=value
        self.result=result
        
        self.left = None
        self.right = None
    

In [220]:
class DecisionTree:
    
    def __init__(self):
        self.root= None
        
    def best_col(self,x_data,y_data):
        gains = []
        
        for col in x_data.columns:
            gains.append((col, info_gain(x_data, y_data, col)))
        gains= sorted(gains, key=lambda a:a[1])
        
        return gains[-1][0]
        
        
    def generate(self, x_data,y_data, max_depth=5):
        
        if max_depth == 0:
            #mean
            return Node(result=np.mean(y_data))
        
        best = self.best_col(x_data,y_data)
        mean = np.mean(x_data[best])
        
        left_x = x_data[x_data[best]<mean]
        right_x = x_data[x_data[best]>=mean]
        
        left_y = y_data[x_data[best]<mean]
        right_y = y_data[x_data[best]>=mean]
        
        if left_y.shape[0] == 0 or right_y.shape[0] == 0:
            return Node(result=np.mean(y_data))
        
        node = Node(label=best, value=mean)
        
        node.left = self.generate(left_x,left_y,max_depth-1)
        node.right = self.generate(right_x,right_y,max_depth-1)
        
        return node
    
    def display(self, node, indent=0):
        if node ==None:
            return
    
        print("\t"*indent,node.label, node.value, node.result)
        self.display(node.left,indent+1)
        self.display(node.right,indent+1)
        
        
    def predict(self,x_data):
        res=[]
        for data in x_data.iterrows():
            val = self.find(data[1],self.root)
            if val > .5:
                res.append(1)
                
            else:
                res.append(0)
                
            return res
                
                
    def find(self,data,node):
        if node.label ==None:
            return node.result
        
        if (data[node.label]<node.value):
            
            return self.find(data,node.left)
        
        else:
            return self.find(data, node.right)
                
            
    def score(self,x_data,y_data):
        pred = self.predict(x_data)
        
        return sum(pred == y_data) / y_data.shape[0]

                
            
        
        
        
    

In [221]:
tree = DecisionTree()
tree.root = tree.generate(X_train,y_train)

In [226]:
tree.root = tree.generate(X_train,y_train,8)

In [227]:
tree.root.label

'Sex'

In [228]:
#tree.display(tree.root)

In [229]:
print(tree.score(X_test,y_test))

0.5932203389830508
