# **Assignment 3**
*Author: Logan Reine*

In [3]:
import pandas as pd
import matplotlib.pyplot as plot
import math
import numpy as np
import seaborn as sea
import warnings

# Notebook-wide settings.  The seaborn scatterplot matrix (SPLOM) relies on a
# deprecated component and emits a FutureWarning, so that category is muted;
# pandas' chained-assignment warning is switched off as well.
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option("mode.chained_assignment", None)

# Column groups used throughout the notebook.
continuous_columns = ["Base_Total", "HP", "Defense", "Attack", "Speed", "Sp_Attack", "Sp_Defense"]
categorical_columns = ["Pokedex_ID", "Name", "Type_1", "Type_2", "Generation"]

# Load the tab-separated data set and split it by feature kind.
Pokedex = pd.read_csv("data/pokemon_data.tsv", sep='\t')
continuous_Pokedex = Pokedex[continuous_columns]
categorical_Pokedex = Pokedex[categorical_columns]

# Show every float with two decimal places.
# (pd.reset_option('display.float_format') restores the default.)
pd.options.display.float_format = '{:.2f}'.format

# **1. Data Quality Reports**

### *Continuous Data*

In [None]:
# Continuous-feature data quality report: one row per numeric column.
selected_Features = continuous_Pokedex.columns.tolist()

# Report column label -> statistic computed on a single feature Series.
# Insertion order fixes the column order of the report.
continuous_Stats = {
    'Count': lambda s: s.count(),
    '% Miss.': lambda s: s.isna().sum() / len(s) * 100,
    'Card.': lambda s: s.nunique(),
    'Min.': lambda s: s.min(),
    '1st Qrt.': lambda s: s.quantile(.25),
    'Mean': lambda s: s.mean(),
    'Median': lambda s: s.median(),
    '3rd Qrt.': lambda s: s.quantile(.75),
    'Max.': lambda s: s.max(),
    'Std. Dev.': lambda s: s.std(),
}

report_Columns = {'Feature': selected_Features}
for label, stat in continuous_Stats.items():
    report_Columns[label] = [stat(continuous_Pokedex[name]) for name in selected_Features]

con_Data_Quality = pd.DataFrame(report_Columns)

con_Data_Quality

### *Categorical Data*

In [None]:
# Categorical-feature data quality report: one row per categorical column.
selected_Features = categorical_Pokedex.columns.tolist()


def _categorical_summary(name, series):
    """Return one report row (as a dict) for a single categorical feature.

    The mode value comes from ``Series.mode``; its frequency and the second
    mode come from ``Series.value_counts``, which is computed once here and
    leaves out missing values.  Percentages are taken over ``len(series)``,
    i.e. missing entries stay in the denominator.

    Entries that cannot be computed are ``None``: the mode columns when the
    feature has no non-missing value, the 2nd-mode columns when it has fewer
    than two distinct levels.
    """
    counts = series.value_counts()
    total = len(series)
    has_mode = len(counts) > 0      # False for an empty or all-missing feature
    has_second = len(counts) > 1    # a second mode needs two distinct levels
    return {
        'Feature': name,
        'Count': series.count(),
        '% Miss.': series.isna().sum() / total * 100,
        'Card.': series.nunique(),
        'Mode': series.mode().iloc[0] if has_mode else None,
        'Mode Freq.': counts.iloc[0] if has_mode else None,
        'Mode %': (counts.iloc[0] / total) * 100 if has_mode else None,
        '2nd Mode': counts.index[1] if has_second else None,
        '2nd Mode Freq.': counts.iloc[1] if has_second else None,
        '2nd Mode %': (counts.iloc[1] / total) * 100 if has_second else None,
    }


cat_Data_Quality = pd.DataFrame(
    [_categorical_summary(name, categorical_Pokedex[name]) for name in selected_Features]
)

cat_Data_Quality

# **2. Histograms of Continuous features**
### *Note: The number of bins used for each histogram was determined by taking the ceiling of the square root of that series' max.*

## *Feature: Base_Total*

In [None]:
# Base_Total histogram; the bin count is ceil(sqrt(largest Base_Total)).
base_total = continuous_Pokedex["Base_Total"]
base_total_bins = math.ceil(math.sqrt(base_total.max()))

plot.hist(base_total, bins=base_total_bins, color='black', edgecolor='white')
plot.title('Distribution of Base Totals')
plot.xlabel('Base Total')
plot.ylabel('Frequency')
plot.show()

### *The **Base Total** of the population is fairly multimodal, with a few outliers near the 800 range.*

## *Feature: HP*

In [None]:
# HP histogram; the bin count is ceil(sqrt(largest HP)).
hp = continuous_Pokedex["HP"]
hp_bins = math.ceil(math.sqrt(hp.max()))

plot.hist(hp, bins=hp_bins, color='black', edgecolor='white')
plot.title('Distribution of Health Points (HP)')
plot.xlabel('HP')
plot.ylabel('Frequency')
plot.show()

### *The **HP** of the population is unimodal with a positive skew.  It's arguably exponential as well, given how aggressively the decline slopes off.  The mean of this sample population is less than the median.*

## *Feature: Defense*

In [None]:
# Defense histogram; the bin count is ceil(sqrt(largest Defense)).
defense = continuous_Pokedex["Defense"]
defense_bins = math.ceil(math.sqrt(defense.max()))

plot.hist(defense, bins=defense_bins, color='black', edgecolor='white')
plot.title('Distribution of Defense')
plot.xlabel('Defense')
plot.ylabel('Frequency')
plot.show()

### *The **Defense** of the population is positively skewed, with the mean falling below the median.  This sample also shares similarities with an exponential distribution, although not as drastic as the previous sample.*

## *Feature: Attack*

In [None]:
# Attack histogram; the bin count is ceil(sqrt(largest Attack)).
attack = continuous_Pokedex["Attack"]
attack_bins = math.ceil(math.sqrt(attack.max()))

plot.hist(attack, bins=attack_bins, color='black', edgecolor='white')
plot.title('Distribution of Attack')
plot.xlabel('Attack')
plot.ylabel('Frequency')
plot.show()

### *The **Attack** subset is the closest we've come to a normal distribution, although I'd still regard this set as negatively skewed.*

## *Feature: Speed*

In [None]:
# Speed histogram; the bin count is ceil(sqrt(largest Speed)).
speed = continuous_Pokedex["Speed"]
speed_bins = math.ceil(math.sqrt(speed.max()))

plot.hist(speed, bins=speed_bins, color='black', edgecolor='white')
plot.title('Distribution of Speed')
plot.xlabel('Speed')
plot.ylabel('Frequency')
plot.show()

### *The **Speed** statistic is also nearing a normal distribution, and is mostly unimodal.  I wouldn't confidently describe this set as skewed in either direction, but as somewhere in between skewed right and normal.*

## *Feature: Sp_Attack*

In [None]:
# Sp_Attack histogram; the bin count is ceil(sqrt(largest Sp_Attack)).
sp_attack = continuous_Pokedex["Sp_Attack"]
sp_attack_bins = math.ceil(math.sqrt(sp_attack.max()))

plot.hist(sp_attack, bins=sp_attack_bins, color='black', edgecolor='white')
plot.title('Distribution of Special Attack')
plot.xlabel('Sp_Attack')
plot.ylabel('Frequency')
plot.show()

### *With the **Sp_Attack** statistic, we've returned to the skewed unimodal sets.  Once again, the central tendency of this set is below the median.*

## *Feature: Sp_Defense*

In [None]:
# Sp_Defense histogram; the bin count is ceil(sqrt(largest Sp_Defense)).
sp_defense = continuous_Pokedex["Sp_Defense"]
sp_defense_bins = math.ceil(math.sqrt(sp_defense.max()))

plot.hist(sp_defense, bins=sp_defense_bins, color='black', edgecolor='white')
plot.title('Distribution of Special Defense')
plot.xlabel('Sp_Defense')
plot.ylabel('Frequency')
plot.show()

### *The **Sp_Defense** has a more pronounced skew to the right, and is knocking on the door of another exponential model.*

# **3. Identification of Data Quality Issues**

In [None]:
# Summary table of the data quality issues found in the reports, with the
# strategy chosen to handle each one.
Data_Quality_Issues = pd.DataFrame({
  'Feature': ["Name", "Pokedex_ID", "Type_2"],
  'Quality Issue': ["Uninformative", "Uninformative", "Missing Values (48%)"],
  # Header previously read 'Handing Strategy' (typo).
  'Handling Strategy': ["Remove", "Remove", "Imputation"],
})

Data_Quality_Issues

### *The **Name** and **ID**, depending on the scope of the analysis, aren't very informative.  Every instance of these features is unique, and serves only as an identifier for each Pokemon.  If I'm looking at overall patterns in the set, I don't necessarily need each Pokemon's ID.  By removing these categories, there's potentially an overall noise reduction in the dataset.*

### *To correct the missing values in **Type_2**, I would duplicate the feature values from the **Type_1** column of the same index.*

In [None]:
# Apply the quality fixes: drop the identifier columns (Name, Pokedex_ID),
# fill missing Type_2 values and treat Generation as a category label.
# .copy() makes the selection an independent frame, so the column
# assignments below are real writes rather than writes into a temporary.
revised_categorical_Pokedex = categorical_Pokedex[['Type_1', 'Type_2', 'Generation']].copy()

# Impute a missing secondary type with the primary type of the same row.
# The filled Series is assigned back explicitly: a chained
# df[col].fillna(..., inplace=True) only modifies an intermediate object when
# pandas Copy-on-Write is active and would leave the frame unchanged.
revised_categorical_Pokedex['Type_2'] = revised_categorical_Pokedex['Type_2'].fillna(
    revised_categorical_Pokedex['Type_1']
)
revised_categorical_Pokedex['Generation'] = revised_categorical_Pokedex['Generation'].astype(str)

# **4. Scatterplot Matrix**

In [None]:
# Scatterplot matrix of all continuous features, with a kernel density
# estimate on the diagonal.
sea.pairplot(data=continuous_Pokedex, diag_kind="kde")

### *There appears to be a significantly common semi-linear grouping pattern throughout virtually every single metric.*

# **5. Visualizing Pairs of Categorical Features**

In [None]:
# Grouped bar chart of Type_1 counts, split by Generation.
# Uses revised_categorical_Pokedex (columns Type_1, Type_2, Generation).
primary_type = revised_categorical_Pokedex['Type_1']
generation = revised_categorical_Pokedex['Generation']
cross_tab = pd.crosstab(primary_type, generation)

fig, ax = plot.subplots(figsize=(12, 4))
cross_tab.plot.bar(stacked=False, ax=ax)
ax.set_xlabel('Types')
ax.set_ylabel('Frequency')
ax.set_title('Bar Plot of Type_1 and Generation')

# Render the figure
plot.show()

In [None]:
# Grouped bar chart of Type_2 counts, split by Generation.
# Uses revised_categorical_Pokedex (columns Type_1, Type_2, Generation).
secondary_type = revised_categorical_Pokedex['Type_2']
generation = revised_categorical_Pokedex['Generation']
cross_tab = pd.crosstab(secondary_type, generation)

fig, ax = plot.subplots(figsize=(12, 4))
cross_tab.plot.bar(stacked=False, ax=ax)
ax.set_xlabel('Types')
ax.set_ylabel('Frequency')
ax.set_title('Bar Plot of Type_2 with Generation')

# Render the figure
plot.show()

In [None]:
# Grouped bar chart of Type_1 counts, split by Type_2.
# Uses revised_categorical_Pokedex (columns Type_1, Type_2, Generation).
primary_type = revised_categorical_Pokedex['Type_1']
secondary_type = revised_categorical_Pokedex['Type_2']
cross_tab = pd.crosstab(primary_type, secondary_type)

fig, ax = plot.subplots(figsize=(12, 4))
cross_tab.plot.bar(stacked=False, ax=ax)
ax.set_xlabel('Types')
ax.set_ylabel('Frequency')
ax.set_title('Bar Plot of Type_1 with Type_2')
# Lay the legend entries out in 9 columns in the upper-right corner.
ax.legend(loc='upper right', ncol=9)

# Render the figure
plot.show()

# **6. Visualizing Relationship Between a Categorical and Continuous Feature**

In [None]:
# Stacked bar plot with one bar per row of the data set: the Generation value
# at the bottom and the Base_Total value stacked on top of it.
subset_cat_con_stacked = Pokedex[['Generation', 'Base_Total']]
x = subset_cat_con_stacked.index
generation_heights = subset_cat_con_stacked["Generation"]
base_total_heights = subset_cat_con_stacked["Base_Total"]

fig, ax = plot.subplots(figsize=(12, 5))
ax.bar(x, generation_heights, label='Generation')
ax.bar(x, base_total_heights, bottom=generation_heights, label='Base_Total')
ax.set(
    xlabel='Generation / Base Total',
    ylabel='Value',
    title='Generation and Base Total Stacked Bar Plot',
)
ax.legend()

# Render the figure
plot.show()

## *'Generation' and 'Base Total' Stacked Barplot Analysis*

### *I wanted to examine the relationship between later generations and base power, to see if over the years the base stats have increased, or even decreased.  To my surprise, some of the highest stat based Pokemon were created in the mid-early to mid generations.*

# **7. Boxplot Visualizations**

In [None]:
# Data prep: one horizontal box per column (Generation, then Base_Total).
subset_cat_con_boxplot = Pokedex[['Generation', 'Base_Total']]

# Build/customize boxplot
plot.figure(figsize = (14.5, 5))
plot.boxplot(subset_cat_con_boxplot, vert = False)
# Label the two boxes only after drawing them: boxplot() manages the tick
# locations and labels of the category axis itself, so labels set before the
# call do not survive it.  Positions 1 and 2 follow the column order above.
plot.yticks([1, 2], ['Generation', 'Base Total'])
plot.title('Generation vs Base Total Box plots')

# Show the plot
plot.show()

## *'Generation' and 'Base Total' Boxplot Analysis*

### *Due to the nature of how these features are catalogued, they aren't ideal factors to compare.  Without reformatting the data into other forms -- such as binning the base totals -- it's difficult for me to infer anything useful.*

# **8. Covariance Matrix**

In [None]:
# Pairwise covariance of the continuous features
# (Base_Total, HP, Defense, Attack, Speed, Sp_Attack, Sp_Defense).
con_cov_matrix = continuous_Pokedex.cov()
print(con_cov_matrix)

### *Similar to the inferences from the scatterplot matrix, there aren't any negative covariances, meaning these features tend to increase and decrease together.  These results support the earlier notion that there is a positive relationship with almost all of these pairs.*

# **9. Correlation Matrix**

In [None]:
# Pairwise correlation of the continuous features
# (Base_Total, HP, Defense, Attack, Speed, Sp_Attack, Sp_Defense).
con_corr_matrix = continuous_Pokedex.corr()
print(con_corr_matrix)

### *The highest correlations are the pairs with the **Base_Total**.  Which tracks, because the higher the base stat pool of any Pokemon would yield higher values.  There also seems to be a correlation between special and normal attack and defense.  There is a correlation of .47 between **Defense** and **Attack**, and there's a .51 correlation between **Sp_Defense** and **Sp_Attack**.  This could suggest that each Pokemon has a natural affinity towards normal (physical) means of defense and offense, or they have a special (magical) affinity in combat.*

# **10. Range Normalization**

In [None]:
# Features to rescale: the six individual stats from continuous_Pokedex
# (Base_Total is not selected).
norm_before_continuous_Pokedex = continuous_Pokedex[['HP', 'Defense', 'Attack', 'Speed', 'Sp_Attack', 'Sp_Defense']]

def normalize(continuous_Pokedex_vector, norm_range=(0, 1)):
    """Min-max scale a numeric vector into ``norm_range``.

    Parameters
    ----------
    continuous_Pokedex_vector
        Numeric vector offering ``min()``, ``max()``, element-wise arithmetic
        and ``tolist()`` (e.g. a pandas Series or a NumPy array).
    norm_range
        ``(low, high)`` bounds of the target range; defaults to ``(0, 1)``.

    Returns
    -------
    list
        The scaled values, in the order of the input.  When every value is
        identical (zero spread) each entry maps to ``low`` instead of the
        0/0 result the plain formula would give; missing values stay missing.
    """
    low, high = norm_range
    vector_min, vector_max = continuous_Pokedex_vector.min(), continuous_Pokedex_vector.max()
    offsets = continuous_Pokedex_vector - vector_min
    spread = vector_max - vector_min
    if spread == 0:
        # Constant vector: multiplying by 0.0 gives zeros but keeps NaN as NaN.
        fractions = offsets * 0.0
    else:
        fractions = offsets / spread
    return (fractions * (high - low) + low).tolist()


# Range-normalize the selected features by passing each column to normalize().
norm_after_continuous_Pokedex = norm_before_continuous_Pokedex.apply(normalize)

### **Before Range Normalization**
*Note: only the 'head' of each DataFrame is shown due to the size (1602) of the combined populations.  If you wish to see the full samples, you can delete the '#' below commenting out the full dataframe displays.*

In [None]:
# Preview the first 10 rows of the features before normalization.
norm_before_continuous_Pokedex.head(10)
# Uncomment the next line to display the full frame instead:
# norm_before_continuous_Pokedex

### **After Range Normalization**

In [None]:
# Preview the first 10 rows of the features after normalization.
norm_after_continuous_Pokedex.head(10)
# Uncomment the next line to display the full frame instead:
# norm_after_continuous_Pokedex

### *I chose HP, Defense, Attack, Speed, Sp_Attack, and Sp_Defense.  All of these values are indicators of a strength in a specific trait.  If I normalize the values of each vector, then I can compare them effectively against stats from other vectors related to combat, instead of scaling their magnitude against values of the same subset.*

# **11. Binning**

### *Equal Width Binning of 'Base_Total'*

In [None]:
# Equal-width binning of Base_Total.
# .copy() makes the selection an independent frame, so adding the bin column
# below is a plain column assignment rather than a write into a selection of
# continuous_Pokedex.
equal_bin_pokedex = continuous_Pokedex[['Base_Total']].copy()

# Number of bins (pd.cut derives the bin width from the observed range).
# The equal-frequency binning cell reuses this value.
num_bins = 10

# Equal-width binning; interval edges are shown with a precision of 2.
equal_bin_pokedex['Equal Width Bin'] = pd.cut(equal_bin_pokedex['Base_Total'], bins = num_bins, precision = 2)

equal_bin_pokedex

### *Equal Frequency Binning of 'Attack'*

In [None]:
# Equal-frequency binning of Attack.
# .copy() makes the selection an independent frame, so the columns added
# below are plain column assignments.
freq_bin_pokedex = continuous_Pokedex[['Attack']].copy()

# Range-normalized Attack next to the raw value.  A Series (single brackets)
# is assigned, which is what a single column expects.
freq_bin_pokedex['Normalised'] = norm_after_continuous_Pokedex['Attack']

# qcut performs the frequency binning; labels=False yields integer bin codes.
# num_bins is defined in the equal-width binning cell.
freq_bin_pokedex['Equal Freq. Bin'] = pd.qcut(freq_bin_pokedex['Attack'], q = num_bins, labels = False)

freq_bin_pokedex

# **12. Undersampling**

In [7]:
# Frequency table of the Type_1 levels, as returned by value_counts().
type_1_values = categorical_Pokedex['Type_1'].value_counts()

type_1_freq = pd.DataFrame(
    dict(Type=type_1_values.index, Frequency=type_1_values.values)
)

# Show only the counts here; the matching type names are printed separately.
frequency_column = type_1_freq['Frequency']
print(frequency_column)

0     114
1     105
2      78
3      72
4      53
5      52
6      45
7      39
8      32
9      32
10     29
11     28
12     27
13     27
14     24
15     23
16     18
17      3
Name: Frequency, dtype: int64


In [4]:
# Print the 'Type' column of the frequency table.
print(type_1_freq['Type'])

0        water
1       normal
2        grass
3          bug
4      psychic
5         fire
6         rock
7     electric
8       poison
9       ground
10        dark
11    fighting
12       ghost
13      dragon
14       steel
15         ice
16       fairy
17      flying
Name: Type, dtype: object


### *The dataframe above shows the frequency of each variable from **Type_1**.  If we look at the feature type **water**, it appears in the set 114 times.  With a total population size of 801, this means that 14% of all resulting data is going to be from the **water** type.  Depending on the scope of the query, one variable impacting 14% of the results may distort the information, and give a disproportionate view of the population.  This may create a need to undersample the **water** type in an attempt to create an equal distribution of statistics, so that one value doesn't outweigh or distort the set.*

# **13. Oversampling**

In [39]:
# Rebuild the Type_1 frequency table for this section.
type_1_values = categorical_Pokedex['Type_1'].value_counts()

type_1_freq = pd.DataFrame(
    dict(Type=type_1_values.index, Frequency=type_1_values.values)
)

# Count of the first (most frequent) level.
most_common_count = type_1_values.values[0]
print(most_common_count)

114


In [57]:
# Type_1 frequency table (same construction as in the previous cells).
type_1_values = categorical_Pokedex['Type_1'].value_counts()

type_1_freq = pd.DataFrame(
    dict(Type=type_1_values.index, Frequency=type_1_values.values)
)

def log2(x):
    """Return the base-2 logarithm of ``x``."""
    # math.log2 is more accurate than math.log(x, 2), which divides two
    # natural logarithms.
    return math.log2(x)


def p(x, n):
    """Return the proportion ``x / n`` (``n`` must be non-zero)."""
    return x / n


def entropy(_D):
    """Return the Shannon entropy, in bits, of the levels of ``_D``.

    Parameters
    ----------
    _D
        pandas Series of categorical values.

    Returns
    -------
    float
        ``-sum(p_i * log2(p_i))`` over the distinct non-missing levels.
        Probabilities are taken over the non-missing observations only:
        ``value_counts()`` drops missing values, so dividing by the full
        series size would give probabilities that do not sum to 1.
        An empty or all-missing series has entropy 0.0.
    """
    levels_D = _D.value_counts()
    n_observed = levels_D.sum()
    if n_observed == 0:
        return 0.0
    return -sum(p(count, n_observed) * log2(p(count, n_observed)) for count in levels_D.values)

# Entropy of the Type_1 feature, using the entropy() helper defined in this cell.
entropy(categorical_Pokedex['Type_1'])

3.880547553784042

### *Referencing the same dataset from **12.** **Undersampling**, if we were to examine the last type in the set, **flying**, it has a frequency of 3.  That's a rate of roughly 0.4% (3 of 801).  For all intents and purposes, the chance of the **flying** type occurring in a sample set is close to zero.  Depending on the scope of the query, this could potentially be a case where oversampling would be necessary to create an equally proportionate distribution.*