In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score
from collections import Counter


In [2]:
# Preprocessing data
# Source: lecture DAT200, file rawDataInspection_01.py

In [38]:
# Load the training and the test set from CSV files in the working directory
train_csv, test_csv = 'CA3-train.csv', 'CA3-test.csv'
df = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [50]:
# Search for missing values in BOTH the training and the test set.
# `missing` holds the per-column NaN counts of the test set (as before);
# `missing_train` holds the same counts for the training set, which the
# model is fitted on and therefore must be checked as well.
missing = np.asarray(df_test.isnull().sum())
missing_train = np.asarray(df.isnull().sum())
if missing.any() or missing_train.any():
    print("Dataset has missing values")
else:
    print('No missing values!')

No missing values!


In [51]:
# Column positions of the features: the half-open range [c_first, c_last)
c_first = 1
c_last = 25    # not included in the features; this position holds the target
# Slice the feature columns once, then split into the feature matrix X_train
# (NumPy array) and the target y_train (pandas Series). The target position is
# tied to c_last so the two cannot drift apart when the range is edited.
df_features = df.iloc[:, c_first:c_last]
X_train, y_train = df_features.values, df.iloc[:, c_last]
print("Selected features:", df_features.columns)

Selected features: Index(['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11',
       'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21',
       'f22', 'f23', 'f24'],
      dtype='object')


In [52]:
# Assign X_test: the same column positions [c_first, c_last) as the training features.
# NOTE(review): assumes the test file shares the training file's column layout — confirm.
X_test = df_test.iloc[:, c_first:c_last].values

In [54]:
# Random seed reused by the model cells below
seed = 1

# Standardize the features: the scaler is fitted on the training set only and
# then applied unchanged to both the training and the test set
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [65]:
# Fit a random forest on the standardized training data and predict the
# class labels of the standardized test set
forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=25,
    n_jobs=-1,
    random_state=seed,
)
forest.fit(X_train_std, y_train)
y_pred = forest.predict(X_test_std)    # predicted class labels

In [79]:
# Display the predicted class labels for the test set
y_pred

array([0., 2., 0., ..., 0., 0., 1.])

In [101]:
# Tally the predicted labels, then report the count of classes 0, 1 and 2
# together with each class's share of all predictions
c = Counter(y_pred)
total = sum(c.values())
for i in (0, 1, 2):
    count = c[i]
    print("Class", i, ":", count)
    print("Share of total: {:.3}".format(count / total))

Class 0 : 1271
Share of total: 0.332
Class 1 : 1702
Share of total: 0.445
Class 2 : 855
Share of total: 0.223


In [None]:
# Calculate accuracy without feature extraction
# NOTE(review): `fit_test_size`, `X`, `y` and `test_size_list` are not defined in
# the visible part of this notebook — confirm they exist before running this cell.
fit_test_size(forest, X, y, test_size_list, seed)

In [None]:
# Calculate accuracy with feature extraction (LDA, 2 components)
# NOTE(review): `fit_test_size`, `X`, `y` and `test_size_list` are not defined in
# the visible part of this notebook — confirm they exist before running this cell.
fit_test_size(forest, X, y, test_size_list, seed, feature_extraction=LDA, n_components=2)

In [None]:
# Calculate accuracy with feature extraction (PCA, 10 components)
# NOTE(review): `fit_test_size`, `X`, `y` and `test_size_list` are not defined in
# the visible part of this notebook — confirm they exist before running this cell.
fit_test_size(forest, X, y, test_size_list, seed, feature_extraction=PCA, n_components=10)

### Looking into the data

Comment: this section might be skipped in the final version.

* Search for correlations
* Look for outliers
* Visualize

In [None]:
# Feature columns as a DataFrame for exploration. Reuses the c_first/c_last
# bounds defined for X_train so both views always cover the same columns.
df_X = df.iloc[:, c_first:c_last]

In [None]:
# =============================================================================
# Descriptive statistics
# =============================================================================
# Summary statistics for every column of df_X
df_X.describe()
# Single-column alternative, kept for reference:
#df.iloc[:, 16].describe()

In [None]:
# Boolean mask (NumPy array) marking the rows where f16 is strictly positive

positive_f16 = (df['f16'] > 0).values
# Number of True entries, i.e. rows with f16 > 0
positive_f16.sum()

In [None]:
# Keep only the strictly positive f16 values (rows with f16 <= 0 are left out)

is_positive = df['f16'] > 0
extract_positive_f16 = df.loc[is_positive, 'f16']
extract_positive_f16

In [None]:
# Histogram of the strictly positive f16 values
extract_positive_f16.hist()

In [None]:
# Descriptive statistics for the strictly positive f16 values

extract_positive_f16.describe()

In [None]:
# =============================================================================
# Histograms
# =============================================================================

# Histogram of the column at position 16 of df (zeros included)
column_16 = df.iloc[:, 16]
column_16.hist()
plt.tight_layout()
plt.show()

Notes AH: Possible outliers in f16: the maximum is very high relative to the rest of the values.
There are also a lot of zero values (more than two-thirds of the rows). Consider dropping the column.

In [None]:
# =============================================================================
# Density plots
# =============================================================================

# Density plot of f16 over all rows
df['f16'].plot.density()
plt.show()

In [None]:
# The 50 largest f16 values: sort in descending order, keep the first 50 rows

f16_descending = df['f16'].sort_values(ascending=False)
f16_descending.head(50)

In [None]:
# Full rows (all columns) of the 30 samples with the largest f16 values
df.sort_values(by='f16', ascending=False).head(30)

Note: Skip the first six data points from the sorting above.

In [None]:
# =============================================================================
# Plot correlation matrix
# =============================================================================

# Plot the correlation matrix for the four columns at positions 21-24 of df
df_sub = df.iloc[:, 21:25]
correlations = df_sub.corr()

fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per column. The tick count must equal the label count: a fixed
# range of 5 ticks for 4 labels misaligns the labels or makes matplotlib
# raise a ValueError when the tick labels are set.
ticks = np.arange(len(df_sub.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(list(df_sub.columns))
ax.set_yticklabels(list(df_sub.columns))
plt.tight_layout()
plt.show()

In [None]:
# Display the correlation matrix of df_sub as a table
correlations

In [None]:
# Pairwise correlation between the columns at positions 15 and 16 of df
columns_15_16 = df.iloc[:, [15, 16]]
corr_f15f16 = columns_15_16.corr()
corr_f15f16

In [None]:
# Correlation of every feature with f16
corr_all = df_X.corr()
# Select the column by name: df_X starts at f1, so position 19 is f20, not f16
corr_all['f16']

In [None]:
# Standardize the feature columns of df_X.
# Note: this rebinds `sc`, replacing the scaler fitted on X_train earlier.

sc = StandardScaler()
df_X_std = sc.fit(df_X).transform(df_X)
df_X_std


In [None]:
# Scatter plot of two standardized features against each other.
# StandardScaler returns a 2-D NumPy array by default, so a feature must be
# selected as a column with [:, j]; df_X_std[14] would pick row 14 (a single
# sample) instead. Positions 14 and 15 of df_X correspond to f15 and f16.
plt.scatter(df_X_std[:, 14], df_X_std[:, 15])