# Introduction

We performed exploratory data analysis (EDA) to gain key insights from the data before building our ML models. This gave us an idea of how the data is structured across the large number of features and groups present in the data set.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the Feather file holding the training data.
TRAIN_FEATHER_PATH = '../input/training-data-to-feather-python-r-low-mem/train.feather'

# Read the whole training set into a single DataFrame used by every cell below.
train_df = pd.read_feather(TRAIN_FEATHER_PATH)

# Observations, Time Steps & Assets

In [None]:
# Headline dimensions of the training set: row count, number of distinct
# time steps, and number of distinct assets.
observations = train_df.shape[0]
print(f"number of observations: {observations}")

time_steps = train_df.time_id.nunique()
print(f"number of time steps: {time_steps}")

assets = train_df.investment_id.nunique()
# The bounds are the smallest and largest investment_id values; they are
# reported next to the distinct count so the two can be compared.
min_asset_id = train_df.investment_id.min()
max_asset_id = train_df.investment_id.max()
# Fixed: the original message never closed the "(range from ..." parenthesis.
print(f"number of assets: {assets} (range from {min_asset_id} to {max_asset_id})")

# Target Analysis

### Number of Target Observations per Asset

In [16]:
# Number of non-null target rows recorded for each asset (investment_id).
observations_by_asset = train_df.groupby(['investment_id'])['target'].count()

# Histogram of those per-asset counts. The plot is drawn on the explicitly
# created Axes rather than relying on pyplot's "current axes" state.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
observations_by_asset.plot.hist(bins=60, ax=ax)
# Fixed: the plotted quantity is the observation count per asset, not the
# target value itself, so the title and axis labels now say so.
ax.set_title("number of observations per asset distribution")
ax.set_xlabel("observations per asset")
ax.set_ylabel("number of assets")
plt.show()

### Mean Target Distribution

In [18]:
# Average target per asset, and the average of those per-asset averages.
mean_target = train_df.groupby('investment_id')['target'].mean()
mean_mean_target = mean_target.mean()

# Histogram of the per-asset means, drawn on an explicit Axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
mean_target.plot.hist(bins=60, ax=ax)
ax.set_title("mean target distribution")
plt.show()

print(f"Mean of mean target: {mean_mean_target: 0.5f}")

### Standard Deviation of Target Distribution

In [19]:
# Standard deviation of the target per asset, and the average of those
# per-asset standard deviations.
sts_target = train_df.groupby('investment_id')['target'].std()
mean_std_target = sts_target.mean()

# Histogram of the per-asset standard deviations, drawn on an explicit Axes.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
sts_target.plot.hist(bins=60, ax=ax)
ax.set_title("standard deviation of target distribution")
plt.show()

print(f"Mean of std target: {mean_std_target: 0.5f}")

### Number of Unique Assets, Average Target and Std of Target by Time

In [None]:
# Three stacked panels indexed by time_id:
#   1. number of distinct assets present at each time step
#   2. mean target at each time step, with mean_mean_target as a reference line
#   3. std of target at each time step, with mean_std_target as a reference line
#
# Fixed: the original used a 12x6 figure and then subplots_adjust(top=1.3),
# which places the axes beyond the top edge of the canvas; that only renders
# fully when the backend crops to a tight bounding box. The figure is now made
# tall enough (12x9 with top=0.9 gives the same axes area) so everything stays
# inside the canvas. The wspace setting was dropped because there is only one
# column of subplots.
fig, (ax_assets, ax_mean, ax_std) = plt.subplots(3, 1, figsize=(12, 9))

train_df.groupby('time_id')['investment_id'].nunique().plot(ax=ax_assets)
ax_assets.set_title("number of unique assets by time")

train_df.groupby('time_id')['target'].mean().plot(ax=ax_mean)
ax_mean.set_title("average target by time")
ax_mean.axhline(y=mean_mean_target, color='r', linestyle='--', label="mean")
ax_mean.legend(loc='lower left')

train_df.groupby('time_id')['target'].std().plot(ax=ax_std)
ax_std.set_title("std of target by time")
ax_std.axhline(y=mean_std_target, color='r', linestyle='--', label="mean")
ax_std.legend(loc='lower left')

# hspace leaves room between panels for each title and the x-axis label above it.
fig.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    hspace=0.4)

plt.show()

### Target Mean and Standard Deviation by Time

In [21]:
# Mean and standard deviation of the target at each time step.
time2target_mean = train_df.groupby(['time_id'])['target'].mean()
time2target_std = train_df.groupby(['time_id'])['target'].std()

_, axes = plt.subplots(1, 1, figsize=(24, 12))

# Shaded band spanning one standard deviation either side of the per-time mean.
axes.fill_between(
    time2target_mean.index,
    time2target_mean - time2target_std,
    time2target_mean + time2target_std,
    alpha=0.1,
    color="b",
    label="mean +/- 1 std",
)
# Fixed: the line was labelled "Training score", which does not describe the
# plotted quantity (the mean target per time step).
axes.plot(
    time2target_mean.index,
    time2target_mean,
    "o-",
    color="b",
    label="mean target by time",
)
# Horizontal reference line at mean_mean_target (computed in an earlier cell).
axes.axhline(y=mean_mean_target, color='r', linestyle='--', label="mean")

axes.set_title("target mean and standard deviation by time")
axes.set_ylabel("target")
axes.set_xlabel("time")
# Fixed: labels were assigned to the artists but no legend was ever drawn.
axes.legend(loc='lower left')
plt.show()