In [3]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
plt.style.use("seaborn-colorblind")

train = pd.read_csv("/home/scott/Documents/GIT/data-science/kaggle/Titanic/train.csv")
test = pd.read_csv("/home/scott/Documents/GIT/data-science/kaggle/Titanic/test.csv")

In [4]:
# Recon
train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

### Data Viz

In [5]:
# Here I overlay two histograms on top of another
plt.figure()
plt.hist(train.loc[train.Age.notnull(), 'Age'], bins=25);
plt.hist(train.loc[ ((train.Survived == 1) & (train.Age.notnull()) ), 'Age'], color='yellow', bins=25);
ax = plt.gca();
ax.set_xlabel("Age");
ax.set_ylabel("Population");
ax.legend(['Size of Age cohort', 'Size of Survived']);

<IPython.core.display.Javascript object>

In [6]:
plt.figure(figsize=[6, 4])
plt.scatter(train.Fare, train.Age, marker='.');
ax = plt.gca()
ax.set_xlabel("Ticket price");
ax.set_ylabel("Age");
# Same result, less typing:
# df.plot('Fare', 'Age', kind="scatter")

<IPython.core.display.Javascript object>

In [7]:
# You can also have a heatmap on each scatter plot by passing the c (color) parameter or s (size) parameter:
ax = train.plot.scatter('Fare', 'Age', c='Survived', colormap='viridis', s=train.Fare)
# This is same as plt.gca()
# ax.legend(['legend1'])

# Same as: plt.scatter(train.Fare, train.Age, marker='.', c=train.Survived, s=train.Fare);

<IPython.core.display.Javascript object>

In [8]:
# The same data using a heatmap (2d histogram)

'''
Given a set of ordered pairs describing data points, 
you can count the number of points with similar values to construct a two-dimensional histogram. 
This is similar to a one-dimensional histogram, 
but it describes the joint variation of two random variables rather than just one.
''';

plt.figure(figsize=[6, 4])
plt.hist2d(train.Fare, train.Age.fillna(28), bins=20);
plt.colorbar()
ax = plt.gca()
ax.set_xlabel("Ticket price");
ax.set_ylabel("Age");

<IPython.core.display.Javascript object>

In [9]:
locations = train.groupby("Embarked").agg({'Survived' : np.sum, 'PassengerId' : len }).reset_index()
locations['% survived'] = round(locations.Survived / locations.PassengerId, 2)
locations

Unnamed: 0,Embarked,Survived,PassengerId,% survived
0,C,93,168,0.55
1,Q,30,77,0.39
2,S,217,644,0.34


In [10]:
plt.figure(figsize=(4, 4))
ax = plt.gca()
ax.grid()
plt.bar([1, 2, 3], locations['% survived'])
plt.xticks([1, 2, 3])

ax.set_xticklabels(locations.Embarked)
ax.set_ylim([0, 1])
ax.set_ylabel("% survived");
ax.set_xlabel("Embark Port");

<IPython.core.display.Javascript object>

In [11]:
train.Age.median(), round(train.Age.mean(), 0)

(28.0, 30.0)

In [12]:
train.Cabin.value_counts(dropna=False)[:10]

NaN            687
G6               4
C23 C25 C27      4
B96 B98          4
F2               3
F33              3
D                3
C22 C26          3
E101             3
C125             2
Name: Cabin, dtype: int64

In [13]:
train.drop(['Cabin', 'Ticket'], axis=1, inplace=True)

In [14]:
gender = train[['Sex', 'Survived']].groupby("Sex").agg([np.sum, np.size])
gender['% survived'] = gender['Survived']['sum'] / gender['Survived']['size']
gender

Unnamed: 0_level_0,Survived,Survived,% survived
Unnamed: 0_level_1,sum,size,Unnamed: 3_level_1
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,233,314,0.742038
male,109,577,0.188908


In [15]:
for df in [train, test]:
    df.Sex = df.Sex.map({'male' : 0, 'female' : 1})
    df.Embarked = df.Embarked.map({'S': 0, 'C' : 1, 'Q' : 2})
    df.Embarked.fillna(value=train.Embarked.value_counts(sort=True).index[0], inplace=True)
    df.Age.fillna(value=29, inplace=True)

In [16]:
train.groupby("Pclass").agg({'Fare' : np.mean})

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,84.154687
2,20.662183
3,13.67555


In [17]:
class_survival = train[['Pclass', 'Survived']].groupby("Pclass").agg([np.size, np.sum])
class_survival['% survived'] = round(class_survival['Survived']['sum'] /  class_survival['Survived']['size'], 2)
class_survival

Unnamed: 0_level_0,Survived,Survived,% survived
Unnamed: 0_level_1,size,sum,Unnamed: 3_level_1
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,216,136,0.63
2,184,87,0.47
3,491,119,0.24


In [18]:
train['title'] = train.Name.str.findall("[a-zA-Z+],\s(.+?)\s").apply(lambda x: x[0]).str.strip(".")
test['title'] = test.Name.str.findall("[a-zA-Z+],\s(.+?)\s").apply(lambda x: x[0]).str.strip(".")

In [19]:
train['title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Mlle          2
Col           2
Ms            1
Mme           1
the           1
Don           1
Sir           1
Lady          1
Jonkheer      1
Capt          1
Name: title, dtype: int64

In [20]:
#Filling the uncommon titles in the best way

for df in [train, test]:
    df['title'].replace("Mlle|Lady", "Miss", inplace=True, regex=True)
    df['title'].replace("Major|Col|Capt", "military title", inplace=True, regex=True)
    df['title'].replace("Mme|Ms|Dona", "Mrs", inplace=True, regex=True)
    df['title'].replace("Sir|Jonkheer|the|Don", "Mrs", inplace=True, regex=True)


In [21]:
title_aggr = train[['title', 'Survived']].groupby("title").agg([np.sum, np.size])
title_aggr['% survived'] = round(title_aggr['Survived']['sum'] / title_aggr['Survived']['size'], 2)
title_aggr

Unnamed: 0_level_0,Survived,Survived,% survived
Unnamed: 0_level_1,sum,size,Unnamed: 3_level_1
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Dr,3,7,0.43
Master,23,40,0.57
Miss,130,185,0.7
Mr,81,517,0.16
Mrs,103,131,0.79
Rev,0,6,0.0
military title,2,5,0.4


In [22]:
sib_aggr = train[['Survived', 'SibSp']].groupby("SibSp").agg([np.size, np.sum])
sib_aggr['% survived'] = round(sib_aggr['Survived']['sum'] / sib_aggr['Survived']['size'], 2)
sib_aggr

Unnamed: 0_level_0,Survived,Survived,% survived
Unnamed: 0_level_1,size,sum,Unnamed: 3_level_1
SibSp,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,608,210,0.35
1,209,112,0.54
2,28,13,0.46
3,16,4,0.25
4,18,3,0.17
5,5,0,0.0
8,7,0,0.0


In [23]:
par_aggr = train[['Survived', 'Parch']].groupby("Parch").agg([np.size, np.sum])
par_aggr['% survived'] = round(sib_aggr['Survived']['sum'] / sib_aggr['Survived']['size'], 2)
par_aggr

Unnamed: 0_level_0,Survived,Survived,% survived
Unnamed: 0_level_1,size,sum,Unnamed: 3_level_1
Parch,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,678,233,0.35
1,118,65,0.54
2,80,40,0.46
3,5,3,0.25
4,4,0,0.17
5,5,1,0.0
6,1,0,


In [24]:
def age_cohorts(age):
    cohorts = np.linspace(train.Age.min(), train.Age.max(), num=20)
    i = 0
    for cohort in cohorts:
        if age <= cohort:
            return i
        else:
            i += 1
            
train['age_cohort'] = train.Age.apply(age_cohorts)
test['age_cohort'] = test.Age.apply(age_cohorts)

In [25]:
# Let's see if age cohort has big influence within each gender

In [26]:
male_cohorts = train.loc[train.Sex == 0][['age_cohort', 'Survived']].groupby("age_cohort").agg([np.size, np.sum])
male_cohorts['% survived'] = round(male_cohorts['Survived']['sum'] / male_cohorts['Survived']['size'], 2)
male_cohorts

Unnamed: 0_level_0,Survived,Survived,% survived
Unnamed: 0_level_1,size,sum,Unnamed: 3_level_1
age_cohort,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,1,1,1.0
1,22,14,0.64
2,5,2,0.4
3,9,4,0.44
4,21,2,0.1
5,62,7,0.11
6,59,7,0.12
7,182,30,0.16
8,50,11,0.22
9,49,11,0.22


In [27]:
female_cohorts = train.loc[train.Sex == 1][['age_cohort', 'Survived']].groupby("age_cohort").agg([np.size, np.sum])
female_cohorts['% survived'] = round(female_cohorts['Survived']['sum'] / female_cohorts['Survived']['size'], 2)
female_cohorts

Unnamed: 0_level_0,Survived,Survived,% survived
Unnamed: 0_level_1,size,sum,Unnamed: 3_level_1
age_cohort,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,17,12,0.71
2,9,7,0.78
3,6,0,0.0
4,23,19,0.83
5,29,19,0.66
6,38,30,0.79
7,78,54,0.69
8,29,23,0.79
9,25,23,0.92
10,19,14,0.74


In [28]:
title_dict = pd.Series(train['title'].value_counts().index).to_dict()
title_dict = dict(zip(title_dict.values(), title_dict.keys()))

In [29]:
train['title_cat'] = train['title'].map(title_dict)
test['title_cat'] = test['title'].map(title_dict)

### Machine Learning

In [30]:
ml_train = train[['Pclass', 'Sex', 'age_cohort', 'SibSp', 'Parch', 'Embarked', 'title_cat', 'Survived']]
ml_test = test[['Pclass', 'Sex', 'age_cohort', 'SibSp', 'Parch', 'Embarked', 'title_cat']]

### KNN

In [31]:
'''Knn can be used for both classification and regression
It's a memory-based model, memorizes all examples from the train set
It looks at the position of a new instance, then looks at the class of its neighbors 
and makes the decision

Things to tweak:

1) Distance metric
2) How many neighbors?
3) Aggregation of votes (simple majority vote? should closest neighbors have more weight?)
''';

In [32]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

X = ml_train[['Pclass', 'Sex', 'age_cohort', 'SibSp', 'Parch', 'Embarked', 'title_cat']]
y = ml_train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [33]:
res = {}
for n in range(3, 20):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    res[n] = score

In [34]:
max(res.values())

0.81165919282511212

In [35]:
res

{3: 0.78026905829596416,
 4: 0.79372197309417036,
 5: 0.7982062780269058,
 6: 0.81165919282511212,
 7: 0.7847533632286996,
 8: 0.80269058295964124,
 9: 0.7982062780269058,
 10: 0.80269058295964124,
 11: 0.80717488789237668,
 12: 0.80717488789237668,
 13: 0.79372197309417036,
 14: 0.81165919282511212,
 15: 0.79372197309417036,
 16: 0.79372197309417036,
 17: 0.78026905829596416,
 18: 0.76681614349775784,
 19: 0.79372197309417036}

In [36]:
plt.figure()
plt.scatter(list(res.values()), list(res.keys()))

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7efdad06e198>

In [37]:
res = {}
for n in range(3, 30):
    knn = KNeighborsClassifier(n_neighbors=n, weights="distance")
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    res[n] = score

In [38]:
max(res.values())

0.7982062780269058

In [39]:
plt.figure()
plt.scatter(list(res.values()), list(res.keys()))

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7efdad02d470>

In [40]:
# What about running this with less features?

from sklearn.model_selection import train_test_split

X = ml_train[['Pclass', 'Sex', 'Embarked']]
y = ml_train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

res = {}
for n in range(3, 30):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    res[n] = score

In [41]:
res

{3: 0.7847533632286996,
 4: 0.80717488789237668,
 5: 0.78923766816143492,
 6: 0.81165919282511212,
 7: 0.81165919282511212,
 8: 0.81165919282511212,
 9: 0.81165919282511212,
 10: 0.81165919282511212,
 11: 0.81165919282511212,
 12: 0.81165919282511212,
 13: 0.81165919282511212,
 14: 0.81165919282511212,
 15: 0.81165919282511212,
 16: 0.81165919282511212,
 17: 0.81165919282511212,
 18: 0.81165919282511212,
 19: 0.81165919282511212,
 20: 0.81165919282511212,
 21: 0.81165919282511212,
 22: 0.81165919282511212,
 23: 0.81165919282511212,
 24: 0.81165919282511212,
 25: 0.81165919282511212,
 26: 0.81165919282511212,
 27: 0.81165919282511212,
 28: 0.81165919282511212,
 29: 0.81165919282511212}

In [42]:
max(res.values())

0.81165919282511212

### KNN can be also used for regression:
from sklear.neighbors import KNeighborsRegressor

Its performance is evaluated by using the R-squared score (also called Coefficient of Determiation),
with 0 being just the mean value of all the training values (worst)
to 1 being the perfect prediction (best).

### Linear Models
They determine the output value in terms of a sum of weighted input variables.
There is the w variable (the slope of the line) and b variable (the y-intercept)

<h4>Linear Regression</h4>

It makes sense to use if you expect the data to have a linear relationship (the bigger the house, the higher the prce). You can imagine this model as a scatter plot (X - house size, Y - house price) and the task of the model is to draw a line across these points that is goes right through the middle. Linear models make a strong assumption that the data is linear.

<h4>Least Squares Linear Regression</h4>

Finds the best-fit line that minimizes that mean square error of the model. The square error is calculated by the distance from each point to the line we draw. The goal is to minimize the error so that the line runs as close to all the points as possible.

from sklearn.linear_model import LinearRegression<br>
linreg = LinearRegression().fit(X_train, y_train)<br>
print(linreg.intercept_) (this is the y-intercept)<br>
print(linreg.coef_) (this is the w - weight)<br>


### Linear models : Ridge, Lasso, Polynomial Regression

<h4>Ridge Regression</h4>

Uses the same leads squares method, but it adds a penalty (L2 penalty) for feature weights that are too large.
This penalty is called regularization, and it prevents overfitting.
You can control the severity of this L2 penalty by using the parameter Alpha.

from sklearn.linear_model import Ridge<br>
ridge = Ridge(alpha=20).fit(X_train, y_train)

<h4>Lasso Regression</h4>

Does pretty much the same as Ridge, but it uses the L1 penalty model, which computes the sum of absolute values of the coefficients instead of sum of squares. The outcome is an automatic feature selection - the model assigns the weight 0 to the least influential features. Lasso regression is a good choice is you except only a handful of features to have an influence on the outcome. If this is not the case, Ridge is a better choice.
Lasso is nice because you can quickly see which features are most important, and you can use this knowledge later on, even in different models.

<h4>Polynomial Regression</h4>

Polynomial Regression takes the existing features, and creates new features, derived from existing features, using multiplication. So if you have 2 features to start with, it will create 5 more features derived from the original 2 features. This transforms the problem into a higher-dimensional regression space. You can think of the outcome as allowing a curved line to be fit to the data, instead of the straight line, you gain flexibility.

### Feature preprocessing and regularization

If features have very different scales (like weight and height), the L2 regularization will penalize higher numbers more. So it is important to make sure that all feature values are on the sale scale.

from sklearn.preprocessing import MinMaxScaler<br>
scaler = MinMaxScaler()<br>
scaler.fit(X_train)<br>
X_train_scaled = scaler.transform(X_train)<br>
X_test_scaled = scaler.transform(X_test)<br>

or in one line:
X_train_Scaled = scaler.fit_transform(X_train)

#### It is very important to normalize both the training and test data!

### Logistic Regression

It is a classification model, with the output being a binary value. It is the same as Linear Regression (calculates the slope w and y-intercept b) with one difference:<br>
It runs this result thru a special non-linear function f.  It's an S-shaped function. The effect of this function is to compress its output to a range 0-1 (because its used for binary classification).


'''''''

In [None]:
'''
Idea:
Scape the immobilien scout data and extract features from which ad.
See what feature correlate most with the price (Lasso Regression)
Plot price against other features.
Is it the square meters? or the district? Baujahr?
How linear does the price rise with the square meters?
How many rooms will give you the best deal?
Pull the words from the headline and correlate them to price



'''