# HW1 - Data Exploration and Preparation

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the dataset

In [7]:
# Load the raw dataset; the path is relative to the notebook's working directory.
dataset = pd.read_csv('virus_data.csv')

In [8]:
# Sanity check of the loaded table: (number of rows, number of columns).
dataset.shape

(1250, 26)

In [9]:
def visualize_clf(clf, X, Y, title,
                  xlabel, ylabel,
                  marker_size=50,
                  grid_length=300,
                  linewidths=None):
    """Draw a binary classifier's prediction regions with the samples on top.

    ``clf.predict`` is evaluated on a ``grid_length`` x ``grid_length`` mesh
    that covers the first two columns of ``X`` (extended by 10% of each
    column's range on both sides). The predictions are drawn as filled
    contours and the samples are scattered over them, one colour/marker per
    class.

    Parameters
    ----------
    clf : object
        Must expose ``predict``; it is called with an ``(n, 2)`` array.
    X : pandas.DataFrame or numpy.ndarray
        Samples. Only columns 0 and 1 are used.
    Y : array-like
        Labels; flattened and cast to ``int``. Must contain exactly two
        distinct values, either ``{0, 1}`` or ``{-1, 1}``.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    marker_size : float, default 50
        Passed to ``plt.scatter`` as ``s``.
    grid_length : int, default 300
        Number of mesh points along each axis.
    linewidths : float or None, default None
        Passed to ``plt.scatter`` as ``linewidths``.

    Raises
    ------
    AssertionError
        If ``Y`` does not hold exactly two labels, or the labels are neither
        ``{0, 1}`` nor ``{-1, 1}``.
    """
    from matplotlib.colors import ListedColormap

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()

    # For internal use here, make sure labels are 0 and 1
    Y = np.ravel(Y).astype(int)
    labels = set(Y)
    assert len(labels) == 2, "Can only visualize two unique labels"

    if labels == {-1, 1}:
        Y = (Y + 1) // 2
        labels = set(Y)

    assert labels == {0, 1}, "Could not handle given labels"

    plt.figure(figsize=(8, 8))

    # One marker and one colour per class; the same palette shades the regions
    markers = ["D", "o"]
    palette = sns.color_palette("hls", 2)
    custom_cmap = ListedColormap(palette.as_hex())

    # Mesh bounds: the data range widened by 10% on each side
    x_lo, x_hi = X[:, 0].min(), X[:, 0].max()
    y_lo, y_hi = X[:, 1].min(), X[:, 1].max()
    x_delta = (x_hi - x_lo) * 0.1
    y_delta = (y_hi - y_lo) * 0.1
    xx, yy = np.meshgrid(
        np.linspace(x_lo - x_delta, x_hi + x_delta, grid_length),
        np.linspace(y_lo - y_delta, y_hi + y_delta, grid_length))

    # NOTE(review): this runs before any axes exist, so it probably has no
    # effect on the figure; left in place to keep the rendered output as is.
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

    # Predict every mesh point, then fold the flat result back onto the grid
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=custom_cmap, alpha=0.35)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(alpha=0.75)

    # Plot the training points. Sorting makes the class -> colour/marker
    # pairing explicit instead of relying on set iteration order.
    for label, color, marker in zip(sorted(labels), palette, markers):
        mask = Y == label
        plt.scatter(X[mask, 0], X[mask, 1], color=color,
                    marker=marker,
                    edgecolor='white', s=marker_size,
                    linewidths=linewidths)

    plt.title(title, fontsize=20)
    plt.axis("tight")
    plt.show()

In [12]:
def plot3d(df, colX, colY, colZ, title, hue=None, s=1):
  """Show a 3-D scatter of three columns from four viewing angles.

  The same points are drawn in four side-by-side 3-D panels that differ only
  in azimuth (15, 60, 105 and 150 degrees) at a fixed elevation of 5 degrees.
  X/Y axis labels are set on the second and fourth panels only; the other
  panels have their X/Y tick labels removed. The Z label is set on the second
  panel.

  The function draws with whichever matplotlib backend is currently active;
  it does not select one itself.

  Parameters
  ----------
  df : pandas.DataFrame
      Data holding the columns named below.
  colX, colY, colZ : str
      Column names plotted on the x, y and z axes.
  title : str
      Figure-level title.
  hue : str or None, default None
      Optional column name. Rows where ``df[hue] == True`` get the first
      palette colour, all other rows the second. With ``None`` every point
      uses the first colour.
  s : float, default 1
      Marker size passed to ``scatter3D``.
  """
  fig = plt.figure(figsize=(20, 4))
  axes = [fig.add_subplot(1, 4, position, projection='3d')
          for position in range(1, 5)]

  palette = sns.color_palette("hls", 2)

  # The hue split is identical for every panel, so build it once up front
  if hue is None:
    groups = [(df, palette[0])]
  else:
    # Elementwise comparison on purpose: the column need not be boolean
    is_true = df[hue] == True
    groups = [(df[is_true], palette[0]), (df[~is_true], palette[1])]

  for i, ax in enumerate(axes):
    for subset, color in groups:
      ax.scatter3D(subset[colX], subset[colY], subset[colZ], color=color, s=s)

    if i in (1, 3):
      ax.set_xlabel(colX, labelpad=20)
      ax.set_ylabel(colY, labelpad=20)
    else:
      ax.xaxis.set_ticklabels([])
      ax.yaxis.set_ticklabels([])

    # Rotate the camera by 45 degrees from one panel to the next
    ax.view_init(elev=5., azim=45*i+15)

  axes[1].set_zlabel("\n" + colZ, labelpad=1)

  plt.suptitle(title)
  plt.subplots_adjust(wspace=0.1, top=0.99)
  plt.show()

In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


def _engineer_features(df, symptom_columns=None):
  """Return (copy of ``df`` with raw columns replaced by numeric features, symptom column names).

  No scaling happens here. When ``symptom_columns`` is given, the symptom
  indicator columns are forced onto exactly that list: symptoms missing from
  ``df`` become all -1, symptoms not in the list are dropped.
  """
  out = df.copy()

  # blood_type -> SpecialProperty: +1 for 'O+' / 'B+', -1 for anything else
  out['SpecialProperty'] = np.where(out['blood_type'].isin(['O+', 'B+']), 1, -1)

  # symptoms: ';'-separated string -> one +1/-1 indicator column per symptom
  symptom_flags = out['symptoms'].str.get_dummies(';')
  if symptom_columns is not None:
    symptom_flags = symptom_flags.reindex(columns=symptom_columns, fill_value=0)
  symptom_flags[symptom_flags == 0] = -1
  out = pd.concat([out, symptom_flags], axis=1)

  # sex -> two +1/-1 indicator columns
  out['Female'] = np.where(out['sex'] == 'F', 1, -1)
  out['Male'] = np.where(out['sex'] == 'M', 1, -1)

  # current_location -> location_x / location_y: the first and second
  # single-quoted substrings, parsed as floats.
  # Assumes every value is a string like "('x', 'y')" -- TODO confirm.
  quoted_parts = [location.split("'") for location in out['current_location']]
  out['location_x'] = [float(parts[1]) for parts in quoted_parts]
  out['location_y'] = [float(parts[3]) for parts in quoted_parts]

  # Drop the raw columns that were encoded above, plus pcr_date and patient_id
  out = out.drop(columns=['blood_type', 'symptoms', 'sex',
                          'current_location', 'pcr_date', 'patient_id'])
  return out, list(symptom_flags.columns)


def prepare_data(training_data, new_data):
  """Encode and scale ``new_data`` using statistics taken from ``training_data``.

  Both frames go through the same feature engineering (blood type, symptoms,
  sex and location are turned into numeric columns; ``pcr_date`` and
  ``patient_id`` are dropped). Two scalers are then fitted on the engineered
  training data only and applied to the engineered new data: a min-max
  scaler to the range [-1, 1] and a standard scaler, each on its own
  hard-coded column list.

  The symptom indicator columns of ``new_data`` are aligned to those found in
  ``training_data``, so the result has the training feature set even when the
  new data lacks a symptom or contains one never seen in training.

  Parameters
  ----------
  training_data : pandas.DataFrame
      Raw data the scalers are fitted on. Not modified.
  new_data : pandas.DataFrame
      Raw data to prepare. Not modified.

  Returns
  -------
  pandas.DataFrame
      Prepared copy of ``new_data``.
  """
  train_df, symptom_columns = _engineer_features(training_data)
  new_df, _ = _engineer_features(new_data, symptom_columns=symptom_columns)

  # Columns designated for minmax scaling
  cols_for_minmax = ['PCR_01','PCR_02','PCR_03','PCR_06','cough','fever',
                    'shortness_of_breath','smell_loss','sore_throat','Female','Male', 'SpecialProperty']
  # Columns designated for standard scaling
  cols_for_standard = ['age','weight','num_of_siblings','happiness_score','household_income',
                   'conversations_per_day','sugar_levels','sport_activity','location_x',
                   'location_y','PCR_04','PCR_05','PCR_07','PCR_08','PCR_09','PCR_10']

  # minmax scale all the minmax columns (fit on training data only)
  minmax_scaler = MinMaxScaler(feature_range=(-1,1))
  minmax_scaler.fit(train_df[cols_for_minmax])
  new_df[cols_for_minmax] = minmax_scaler.transform(new_df[cols_for_minmax])

  # Standard scale all the standard columns (fit on training data only)
  standard_scaler = StandardScaler()
  standard_scaler.fit(train_df[cols_for_standard])
  new_df[cols_for_standard] = standard_scaler.transform(new_df[cols_for_standard])

  return new_df

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
data_train, data_test = train_test_split(
    dataset,
    train_size=0.8,
    test_size=0.2,
    random_state=25,
)

# Both splits are prepared against the raw training split, so the scalers
# are always fitted on training rows only.
train_df_prepared = prepare_data(training_data=data_train, new_data=data_train)
test_df_prepared = prepare_data(training_data=data_train, new_data=data_test)

# Write both prepared splits to CSV without the index column.
# NOTE(review): the test file name is spelled 'preprocessesd'; it is kept
# verbatim here because other code may read this exact path.
train_df_prepared.to_csv(path_or_buf='training_preprocessed.csv', index=False)
test_df_prepared.to_csv(path_or_buf='test_preprocessesd.csv', index=False)

NameError: name 'data_train_original' is not defined