In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score

In [2]:
# Prepocessing data
# Source lecture DAT200. File: rawDataInspection_01.py

In [3]:
df = pd.read_csv('CA3-train.csv')

In [4]:
# Search for missing values
missing = np.asarray(df.isnull().sum())
if missing.any():
    print("Dataset has missing values")
else:
    print('No missing values!')

No missing values!


In [43]:
# features index
c_first = 1
c_last = 25    # not included
# Assign features to X matrix and corresponding labels to vector y
X, y = df.iloc[:, c_first:c_last].values, df.iloc[:, 25]
print(f"Selected features:", df.iloc[:, c_first:c_last].columns)

Selected features: Index(['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11',
       'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21',
       'f22', 'f23', 'f24'],
      dtype='object')


In [44]:
# print out the unique class labels
np.unique(y)

array([0., 1., 2.])

In [45]:
# Default parameters
seed = 1
test_size = 0.3

# Splitting data with default parameters
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y,
                                                    random_state=seed)

# Standardizing our data to make algorithms behave better
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.fit_transform(X_test)

In [46]:
# Function to plot decision regions. Works only when two features are selected
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # Source Python for Machine Learning ch05
    # setup marker generator and colormap
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx2.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot examples by class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl,
                    edgecolor='black')
    # highlight test examples
    if test_idx:
        # plot all examples
        X_test, y_test = X[test_idx, :], y[test_idx]

        plt.scatter(X_test[:, 0], X_test[:, 1],
                    c='', edgecolor='black', alpha=1.0,
                    linewidth=1, marker='o',
                    s=100, label='test set')
    plt.xlabel('First feature [standardized]')
    plt.ylabel('Second feature [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

In [47]:
def combined(X_train, X_test, y_train, y_test):
    # Stacking the data before plotting
    X_combined = X_train + X_test
    y_combined = y_train + y_test
    return X_combined, y_combined

In [48]:
def fit_test_size(classifier, X, y, test_size_list, seed, feature_extraction=False, n_components=None):
    # Accuracy for different test_train_splits
    for size in test_size_list:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, stratify=y,
                                                            random_state=seed)
        X_train_std = sc.fit_transform(X_train)
        X_test_std = sc.fit_transform(X_test)
        if feature_extraction:
            # Dimensionality reduction through PCA
            pca = PCA(n_components=n_components, random_state=1)
            X_train_pca = pca.fit_transform(X_train_std)
            X_test_pca = pca.fit_transform(X_test_std)
            classifier.fit(X_train_pca, y_train)
            y_pred = classifier.predict(X_test_pca)
            print(f'Misclassified examples PCA: {(y_test != y_pred).sum()}')
            print('Accuracy PCA: {:.3}'.format(classifier.score(X_test_pca, y_test)))
            print(f'Test size: {size}')
        else:
            classifier.fit(X_train_std, y_train)
            y_pred = classifier.predict(X_test_std)
            print(f'Misclassified examples: {(y_test != y_pred).sum()}')
            print('Training Accuracy: {:.3}'.format(classifier.score(X_train_std, y_train)))
            print('Accuracy: {:.3}'.format(classifier.score(X_test_std, y_test)))
            print(f'Test size: {size}')
    # Note: After function call test size is the last index of test_size_list

In [49]:
# plot cumulative sum of explained variances
def plot_var_exp(n_components):
    pca = PCA(n_components=n_components, random_state=1)
    pca.fit_transform(X_train_std)
    pca.fit_transform(X_test_std)
    var_exp = pca.explained_variance_ratio_
    cum_var_exp = np.cumsum(var_exp)
    plt.bar(range(1, n_components + 1), var_exp, alpha=0.5, align='center',
            label='Individual explained variances')
    plt.step(range(1, n_components + 1), cum_var_exp, where='mid',
             label='Cumulative explained variances')
    plt.xlabel('Explained variance ratio')
    plt.ylabel('Principal component index')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [50]:
# Fit Logistic regression and calculate accuracy for different train_test_splits
C_list = [100, 10, 1, 0.1, 0.01]
max_score = 0
for C in C_list:
    lr_temp = LogisticRegression(penalty='l1', C=C, solver='liblinear', random_state=seed)
    lr_temp.fit(X_train_std, y_train)
    score = lr_temp.score(X_test_std, y_test)
    if score > max_score:
        max_score = score
        C_best = C

print("Best C: ", C_best)
print("Max acurracy score: {:.3}".format(max_score))
print("Test size: ", test_size)

Best C:  1
Max acurracy score: 0.492
Test size:  0.3


In [51]:
# Fit Logistic regression and calculate accuracy for different train_test_splits
test_size_list = [0.6, 0.3, 0.1, 0.05, 0.01]
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear', random_state=seed)
fit_test_size(lr, X, y, test_size_list, seed)

Misclassified examples: 2142
Training Accuracy: 0.519
Accuracy: 0.498
Test size: 0.6
Misclassified examples: 1084
Training Accuracy: 0.526
Accuracy: 0.492
Test size: 0.3
Misclassified examples: 350
Training Accuracy: 0.514
Accuracy: 0.508
Test size: 0.1
Misclassified examples: 170
Training Accuracy: 0.513
Accuracy: 0.522
Test size: 0.05
Misclassified examples: 31
Training Accuracy: 0.513
Accuracy: 0.569
Test size: 0.01


In [None]:
# Calculate accuracy with feature extraction
fit_test_size(lr, X, y, test_size_list, seed, feature_extraction=True, n_components=5)

### Looking into the data

comment: Might skip this in final version

* Search for correlations
* Look for outliers
* Visualize

In [None]:
df_X = df.iloc[:, 1:25]

In [None]:
# =============================================================================
# Descriptive statistics
# =============================================================================
df_X.describe()
#df.iloc[:, 16].describe()

In [None]:
# Check for positive values f16

positive_f16 = np.where(df['f16']>0, True, False)
# count True
positive_f16.sum()

In [None]:
# Drop rows with zero values

extract_positive_f16 = df[df['f16']>0]['f16']
extract_positive_f16

In [None]:
# Histogram positive f16
extract_positive_f16.hist()

In [None]:
# Descriptive statistics for posive f16

extract_positive_f16.describe()

In [None]:
# =============================================================================
# Histograms
# =============================================================================

df.iloc[:, 16].hist()
plt.tight_layout()
plt.show()

Notes AH: Possible outliers f16, very high max relative to the rest.
Also a lot of zero values more than two-thirds. Consider dropping the column.

In [None]:
# =============================================================================
# Density plots
# =============================================================================

df['f16'].plot(kind='density')
plt.show()

In [None]:
# Sort values for f16 in descending order

df['f16'].sort_values(ascending=False)[0:50]

In [None]:
df.sort_values(by=['f16'], ascending=False)[0:30]

Note: Skip the six first datapoints from the sorting above

In [None]:
# =============================================================================
# Plot correlation matrix
# =============================================================================

# plot correlation matrix for the first four features
df_sub = df.iloc[:, 21:25]
correlations = df_sub.corr()

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0, 5, 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(list(df_sub.columns))
ax.set_yticklabels(list(df_sub.columns))
plt.tight_layout()
plt.show()

In [None]:
correlations

In [None]:
corr_f15f16 = df.iloc[:, [15, 16]].corr()
corr_f15f16

In [None]:
# correlations all f16
corr_all = df_X.corr()
corr_all.iloc[:, 19]

In [None]:
# Standardizing

sc = StandardScaler()
df_X_std = sc.fit_transform(df_X)
df_X_std


In [None]:
plt.scatter(df_X_std[14], df_X_std[15])