In [None]:
# Let's import the bread and butter packages of every aspiring data scientist
import numpy as np
import pandas as pd

In [None]:
# Read the raw data from 'data.csv' into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv('data.csv')

In [None]:
# ...and have a peek at the first few rows
data.head()

In [None]:
# Check for any missing data before it gets us in trouble.
# The result has one boolean per column: True means that column holds at least one missing value.
data.isna().any()

In [None]:
# Yeah, guess we need to handle that... What is this data anyway??
# Summary statistics per column (count, mean, std, min, quartiles, max) to get a feel for it.
data.describe()

In [None]:
# Hmm, the columns seem roughly normally distributed, so let's just fill each
# missing value with the mean of its own column.
# NOTE(review): this rebinds `data`, so the raw frame (with its NaNs) is no longer
# reachable after this cell. data.mean() also presumes every column is numeric — confirm.
data = data.fillna(data.mean())

OK, let's try naively plotting some columns against each other... maybe we'll spot something interesting.

In [None]:
import matplotlib.pyplot as plt

# Arbitrary (x, y) column pairs to eyeball against each other
column_combinations = np.array([
    ['1', '4'],
    ['0', '1'],
    ['2', '4'],
    ['3', '4'],
])

# One scatter panel per pair, laid out on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))

# Walk the grid column by column (axes.T), so pair k lands on axes[k % 2][k // 2]:
# pairs 0 and 1 fill the left column, pairs 2 and 3 the right column.
for ax, (x_col, y_col) in zip(axes.T.flat, column_combinations):
    ax.scatter(data[x_col], data[y_col], alpha=0.5)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)

# Tidy up the spacing between panels, then render the figure
plt.tight_layout()
plt.show()

Is it just noise?! But I mean, surely the guy wouldn't just generate a random array and send it over to me... What's that on the last plot (column 3 vs. column 4)? That looks kinda structured... Alright, let's find the structure. PCA to the rescue!

In [None]:
from sklearn.decomposition import PCA

# Number of principal components to keep. Six is the preferred value, but PCA cannot
# extract more components than min(n_samples, n_features), so cap it to what the data
# allows instead of failing with a ValueError on a narrow or short dataset.
n_components = min(6, *data.shape)
pca = PCA(n_components=n_components)

# Fit the PCA model and project the data onto the principal components in one step
X_pca = pca.fit_transform(data)

# Print the fraction of the total variance explained by each component
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Plot the first two components against each other. Column 0 of X_pca is the first
# principal component and column 1 the second, so PC1 goes on the x-axis and PC2 on
# the y-axis to match the axis labels below.
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.6)
plt.title("PCA Transformed Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")

plt.show()
