In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

%matplotlib inline

In [2]:
# Load the Social Network Ads dataset and preview its first rows.
ads_csv = '../archive/Social_Network_Ads.csv'
df = pd.read_csv(ads_csv)
df.head()

In [5]:
# Setting up the X and y values
# Target: the 'Purchased' column.
y = df['Purchased']

# Fixing the categorical data
# Encode 'Gender' as a single numeric 'Female' indicator column. Only one of
# the dummy columns is kept because the other would be its exact complement.
# The explicit astype() guarantees a 0/1 integer column whether get_dummies
# returns bool or uint8. (The previous pd.to_numeric(...) call threw its
# result away, so it never converted anything.)
gender_dummies = pd.get_dummies(df['Gender'])['Female'].astype('uint8')

# Features: every column except the identifier, the target and the raw
# 'Gender' text column, plus the encoded indicator. Built in one expression
# from df (no inplace mutation) so re-running the cell is idempotent.
Xs = pd.concat(
    [df.drop(columns=['User ID', 'Purchased', 'Gender']), gender_dummies],
    axis=1,
)
Xs.head()

Unnamed: 0,Age,EstimatedSalary,Female
0,19,19000,0
1,35,20000,0
2,26,43000,1
3,27,57000,1
4,19,76000,0


In [6]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    Xs,
    y,
    test_size=0.25,
    random_state=0,
)

In [7]:
# Feature Scaling
# Standardize the features using statistics learned from the training split.
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)

# The test split is only transformed, never fitted: it reuses the scaler that
# was already fitted on the training data.
X_test = sc_X.transform(X_test)

In [8]:
# Inspect the dtype of each feature column in Xs.
Xs.dtypes

Age                int64
EstimatedSalary    int64
Female             uint8
dtype: object

In [9]:
# Train a logistic-regression classifier on the scaled training split.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting are chained.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier

In [11]:
# Predict a class label for every row of the scaled test split.
y_pred = classifier.predict(X_test)

In [12]:
# Build the confusion matrix comparing the true test labels with the predictions.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [13]:
# Display the confusion matrix.
cm

array([[65,  3],
       [ 6, 26]])

In [14]:
# Unpack the 2x2 matrix row by row: first row is (tn, fp), second is (fn, tp).
(tn, fp), (fn, tp) = cm
print(tn, fp, fn, tp)

65 3 6 26


- Confusion Matrix: 
    - True negatives: we correctly predicted negative ("no") for 65 cases
    - True positives: we correctly predicted positive ("yes") for 26 cases
    - False positives: we predicted positive when it should have been negative for 3 cases
    - False negatives: we predicted negative when it should have been positive for 6 cases

In [21]:

# Visualising the Training set results
#
# Decision regions are drawn in the plane of the first two (scaled) feature
# columns. The classifier was fitted on every column of X_train, so the grid
# handed to predict() must have that same number of columns; any feature
# beyond the first two is held fixed at its training-set mean. The plot is
# therefore a 2-D slice through the full decision function.
from matplotlib.colors import ListedColormap

X_set, y_set = X_train, y_train
class_labels = np.asarray(y_set)  # plain array so boolean masks index X_set positionally
class_cmap = ListedColormap(('red', 'green'))

# Dense grid covering the range of the two plotted features, padded by 1 unit.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01),
)

# One row per grid point and one column per training feature: start from the
# column means, then overwrite the two plotted features with the grid values.
grid = np.tile(X_set.mean(axis=0), (X1.size, 1))
grid[:, 0] = X1.ravel()
grid[:, 1] = X2.ravel()

plt.contourf(X1, X2, classifier.predict(grid).reshape(X1.shape),
             alpha=0.75, cmap=class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    in_class = class_labels == j
    # `color=` (not `c=`) because a single RGBA tuple passed as `c` is
    # ambiguous with a sequence of values to be colour-mapped.
    plt.scatter(X_set[in_class, 0], X_set[in_class, 1],
                color=class_cmap(i), label=j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

In [33]:
# ravel() just flattens the grid into a 1-D array
print(X1.ravel())

# Wrapping the flat array in a list and transposing gives a single-column 2-D array
print(np.array([X1.ravel()]).T)

# Unique class labels in the training target.
# (Was `Y_set`, which is not defined anywhere; the variable is `y_set`.)
print(np.unique(y_set))

[-2.99318916 -2.98318916 -2.97318916 ...,  3.13681084  3.14681084
  3.15681084]
[[-2.99318916]
 [-2.98318916]
 [-2.97318916]
 ..., 
 [ 3.13681084]
 [ 3.14681084]
 [ 3.15681084]]
[0 1]
