In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kurtosis, skew
import sklearn.feature_selection as fs
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Andrea's data analysis and exploration for HCC1806 SMARTS

In [2]:
# Load the HCC1806 SmartSeq inputs. Both paths are relative to the working
# directory the notebook is started from; a FileNotFoundError here means the
# kernel was launched from a different directory.
# The unfiltered data table is read as space-delimited text.
unfilt_hcc_smarts = pd.read_csv(
    "data/smartSeq/HCC1806_SmartS_Unfiltered_Data.txt",
    sep=" ",
)
# The metadata file is tab-delimited.
meta_hcc_smarts = pd.read_csv("data/smartSeq/HCC1806_SmartS_MetaData.tsv", sep="\t")

FileNotFoundError: [Errno 2] No such file or directory: 'data/smartSeq/HCC1806_SmartS_Unfiltered_Data.txt'

In [None]:
# Report the table dimensions, then show the first rows. Leaving the frame as
# the cell's last expression lets Jupyter render it as a rich table instead of
# the plain-text dump that print() produces.
print(f"Shape (rows, columns): {unfilt_hcc_smarts.shape}")
unfilt_hcc_smarts.head()

First, we look at how the dataset is structured to get a feel for the data.

The columns of the HCC1806 SmartSeq dataset represent the different cells from the cell line. They are 243 in total.

The rows of the HCC1806 SmartSeq dataset represent the different genes; each entry records the quantity of that gene measured in the corresponding cell. There are 23396 rows in total.

## How much data is null in the files?

In [None]:
# Count the missing values in every column, then list the columns that
# contain at least one.
null_counts = unfilt_hcc_smarts.isnull().sum()
print(null_counts)

columns_with_nulls = null_counts[null_counts > 0].index.to_list()
print(f"\nColumns with null data: {columns_with_nulls}")

As shown above, the number of columns with null data is 0.

## Outliers

In [None]:
# Violin plot of the first 50 columns of the unfiltered data, drawn on an
# explicitly created axes object.
fig, plot = plt.subplots(figsize=(16, 4))
sns.violinplot(
    data=unfilt_hcc_smarts.iloc[:, :50],
    palette="Set3",
    cut=0,
    ax=plot,
)
# Turn the x tick labels by 90 degrees.
plt.setp(plot.get_xticklabels(), rotation=90)

In [None]:
# Absolute z-score of every value (scipy's zscore standardises along axis 0,
# i.e. within each column, by default).
abs_zscores = np.abs(stats.zscore(unfilt_hcc_smarts))
# Keep a row only if every one of its values lies within 3 standard deviations.
rows_within_3_sigma = (abs_zscores < 3).all(axis=1)
no_out_hcc_smarts = unfilt_hcc_smarts[rows_within_3_sigma]
no_out_hcc_smarts

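### We remove all rows that have an outlier in at least one column.
With the above outlier removal process, we keep only the rows whose values all have an absolute z-score below 3, i.e. values within three standard deviations of their column mean (roughly the middle 99.7% of the data if it were normally distributed).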

In [None]:
# Violin plot of the first 50 columns after the z-score outlier filter, drawn
# on an explicitly created axes object.
fig, plot = plt.subplots(figsize=(16, 4))
sns.violinplot(
    data=no_out_hcc_smarts.iloc[:, :50],
    palette="Set3",
    cut=0,
    ax=plot,
)
# Turn the x tick labels by 90 degrees.
plt.setp(plot.get_xticklabels(), rotation=90)

We can see that, despite having removed outliers, the distribution of the data has not really changed. This can be attributed to the fact that most of the values are zeros (the dataframe is sparse), so the real information is carried by the non-zero values.

We will discard the removal of outliers and use different methods of data cleaning.

## Data Normalisation

In [None]:
# NOTE(review): project-local import placed mid-notebook; consider moving it
# to the import cell at the top so all dependencies are visible in one place.
from fix_data import add_label_T

# add_label_T is defined in fix_data (not shown here). Presumably it transposes
# the table so that cells become rows and adds the 'label' column that the
# following cells read — TODO confirm against fix_data.
transposed_df = add_label_T(unfilt_hcc_smarts)
# reset_index returns a new frame and the result is not assigned, so this line
# only displays a re-indexed copy; transposed_df itself keeps its index.
transposed_df.reset_index(drop=True)

In [None]:
# Split the 'label' column off from the feature columns of transposed_df.
# pop() returns the column and removes it from the frame in one step. The
# guard makes the cell safe to re-run: once the column has been removed, a
# second execution keeps the existing `target` instead of raising KeyError.
if "label" in transposed_df.columns:
    target = transposed_df.pop("label")

In [None]:
# Remove near-constant features using a variance cut-off.
threshold = 0.05
selector = fs.VarianceThreshold(threshold=threshold).fit(transposed_df)

# get_support() is True for the columns the selector keeps, so the inverted
# mask picks out the low-variance columns that are to be removed.
mask = selector.get_support()
near_zero_var_cols = transposed_df.columns[~mask].tolist()

# Report which columns fall below the cut-off and how many there are.
print(f'Columns with < {threshold} variance: {near_zero_var_cols}')
print(f"In total: {len(near_zero_var_cols)}")

# New frame without the low-variance columns; transposed_df is left untouched.
var_fixed = transposed_df.drop(columns=near_zero_var_cols)
var_fixed

In [None]:
# Standardise the variance-filtered features with StandardScaler, then rebuild
# a DataFrame that carries the original row index and column names.
scaler = StandardScaler().fit(var_fixed)
scaled_features = scaler.transform(var_fixed)
scaled_df = pd.DataFrame(
    data=scaled_features,
    index=var_fixed.index,
    columns=var_fixed.columns,
).assign(label=target)  # re-attach the labels, aligned on the row index
scaled_df

In [None]:
# Split the 'label' column off from the scaled feature columns of scaled_df.
# pop() returns the column and removes it from the frame in one step. The
# guard makes the cell safe to re-run: once the column has been removed, a
# second execution keeps the existing `target` instead of raising KeyError.
if "label" in scaled_df.columns:
    target = scaled_df.pop("label")


In [None]:
# Cumulative explained variance of the first k principal components, for
# k = 1 .. max_components.
# One fit with the largest k is enough: the explained-variance ratios of the
# leading components do not depend on how many further components are
# requested (up to solver approximation), so the running sum reproduces what
# refitting PCA once per k would give, without the repeated fits.
# PCA cannot extract more components than min(n_samples, n_features), so the
# original upper bound of 99 is capped by the data's shape.
max_components = min(99, *scaled_df.shape)
pca = PCA(n_components=max_components)
pca.fit(scaled_df)

# The name `accuracies` is kept for compatibility with later cells; the values
# are cumulative explained-variance ratios, not classification accuracies.
accuracies = np.cumsum(pca.explained_variance_ratio_).tolist()

plt.plot(range(1, max_components + 1), accuracies)
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative explained variance ratio")
plt.show()