In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plot_per_col_dist(df, n_graph_shown, n_graph_per_row):
    """Plot a grid of per-column distributions for the low-cardinality columns of ``df``.

    Only columns with more than 1 and fewer than 50 unique values are considered.
    Numeric columns are drawn as histograms; every other column (including
    boolean ones) is drawn as a bar chart of its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.
    n_graph_shown : int
        Maximum number of columns to plot.
    n_graph_per_row : int
        Number of subplots placed on each row of the grid.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. If there is nothing to
        plot, a message is printed and no figure is created.
    """
    nunique = df.nunique()
    # For displaying purposes, keep columns with more than 1 and fewer than 50 unique values.
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    col_names = list(df)

    n_shown = min(len(col_names), n_graph_shown)
    if n_shown <= 0:
        # A zero-row subplot grid (and zero-height figure) is invalid, so stop early.
        print('No distribution plots shown: no column has between 2 and 49 unique values')
        return

    # Ceiling division. It must stay an int because plt.subplot rejects float grid
    # sizes, and it is based on the plots actually drawn so the figure has no empty rows.
    n_graph_row = (n_shown + n_graph_per_row - 1) // n_graph_per_row
    plt.figure(num=None, figsize=(6 * n_graph_per_row, 8 * n_graph_row), dpi=80, facecolor='w', edgecolor='k')

    for i in range(n_shown):
        plt.subplot(n_graph_row, n_graph_per_row, i + 1)
        col_df = df.iloc[:, i]

        # Decide from the column dtype rather than from the first cell: a leading
        # NaN in a text column is a float and would wrongly select the histogram.
        is_numeric = pd.api.types.is_numeric_dtype(col_df) and not pd.api.types.is_bool_dtype(col_df)
        if is_numeric:
            col_df.hist()
        else:
            col_df.value_counts().plot.bar()

        plt.ylabel('counts')
        plt.xticks(rotation=90)
        plt.title(f'{col_names[i]} (column {i})')
    plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
    plt.show()


In [None]:
# Correlation matrix
def plot_corr_mat(df, g_width):
    """Plot the correlation matrix of the numeric columns of ``df``.

    Columns that contain NaN or hold a single unique value are dropped before
    the correlation is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. If it carries a string attribute ``dataframeName`` (or
        ``dataFrameName``), that value is used in the plot title.
    g_width : float
        Width and height of the square figure, in inches.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. If fewer than two usable
        columns remain, a message is printed and nothing is plotted.
    """
    # Read the label before deriving new frames: the code below rebinds ``df``.
    file_name = 'DataFrame'
    for attr in ('dataframeName', 'dataFrameName'):
        label = getattr(df, attr, None)
        if isinstance(label, str):
            file_name = label
            break

    # Correlation is only defined for numeric data; newer pandas raises on text columns.
    df = df.select_dtypes(include=[np.number, 'bool'])
    df = df.dropna(axis='columns')  # drop columns with NaN (axis must be passed by keyword)
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values

    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()

    fig = plt.figure(num=None, figsize=(g_width, g_width), dpi=80, facecolor='w', edgecolor='k')
    # Draw into the figure created above instead of assuming it is figure number 1.
    correlation_mat = plt.matshow(corr, fignum=fig.number)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(correlation_mat)
    plt.title(f'Correlation Matrix for {file_name}', fontsize=15)
    plt.show()

In [None]:
# Scatter and density plots
def plot_scatter_mat(df, plot_size, text_size):
    """Plot a scatter matrix (KDE on the diagonal) of the numeric columns of ``df``.

    Columns that contain NaN or hold a single unique value are dropped, and at
    most the first 10 remaining columns are plotted. Each panel above the
    diagonal is annotated with the correlation coefficient of its column pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    plot_size : float
        Width and height of the square figure, in inches.
    text_size : float
        Font size of the correlation annotations.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. If no usable column
        remains, a message is printed and nothing is plotted.
    """
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    df = df.dropna(axis='columns')  # remove columns with null values (axis must be passed by keyword)
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values
    col_names = list(df)

    if not col_names:
        # scatter_matrix cannot build a grid from an empty frame.
        print('No scatter plots shown: there are no non-constant numeric columns without NaN')
        return

    if len(col_names) > 10:  # reduce the number of columns for matrix plot
        col_names = col_names[:10]
        print(f'Decreasing number of columns to 10 for better visualization. \n The selected columns are: {col_names}')
    df = df[col_names]

    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plot_size, plot_size], diagonal='kde')
    corrs = df.corr().values

    # Use numpy directly; ``plt.np`` is not part of matplotlib's public API.
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=text_size)

    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [None]:
''' reading 1st file '''
# NOTE(review): n_rows is assigned here but is not passed to read_csv below
# (the other loading cells pass nrows=n_rows), so the whole file is read.
# Confirm whether that is intended.
n_rows = 1000
df_1 = pd.read_csv('Data/dia_3.csv')
# Ad-hoc label on the frame; plot_corr_mat reads `dataframeName` for its plot title.
df_1.dataframeName = 'dia_3.csv'

In [None]:
''' shape of data in 1st file'''
# Last expression of the cell: shows the (rows, columns) tuple of df_1.
df_1.shape

In [None]:
''' displaying first five rows '''
# head() with no argument returns the first five rows of df_1.
df_1.head()

In [None]:
# Correlation matrix of df_1; the second argument (g_width) is used for both figure dimensions.
plot_corr_mat(df_1, 8)

In [None]:
# Read the first n_rows rows of the 2nd file.
n_rows = 1000

df_2 = pd.read_csv('Data/dia_t.csv', nrows=n_rows)
# plot_corr_mat looks up `dataframeName` (lowercase "f"); the previous spelling
# `dataFrameName` would make that lookup fail for this frame.
df_2.dataframeName = 'dia_t.csv'

In [None]:
''' shape of data in 2nd file '''
# Last expression of the cell: shows the (rows, columns) tuple of df_2.
df_2.shape

In [None]:
''' displaying 1st 5 rows'''
# head() with no argument returns the first five rows of df_2.
df_2.head()

In [None]:
# Per-column distributions of df_2: at most 10 graphs (n_graph_shown), 5 per row (n_graph_per_row).
plot_per_col_dist(df_2, 10, 5)

In [None]:
# Read the first n_rows rows of the 3rd file.
n_rows = 1000
df_3 = pd.read_csv('Data/diagn_title.csv', nrows=n_rows)
# plot_corr_mat looks up `dataframeName` (lowercase "f"); the previous spelling
# `dataFrameName` would make that lookup fail for this frame.
df_3.dataframeName = 'diagn_title.csv'

In [None]:
''' displaying first five rows '''
# head() with no argument returns the first five rows of df_3.
df_3.head()

In [None]:
# Per-column distributions of df_3: at most 10 graphs (n_graph_shown), 5 per row (n_graph_per_row).
plot_per_col_dist(df_3, 10, 5)