### **Data Generation: create a notebook (generate_dataset.ipynb) that generates a new table with 500 rows, where each row contains a set of summary attributes extracted from the respective portfolio file.**

In [None]:
# Colab-only step: mount Google Drive so its files are reachable under
# /content/gdrive for the rest of the notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Make the assignment folder the working directory; the relative paths used
# later ('portfolios/...' and 'summary_portfolios.csv') resolve against it.
%cd '/content/gdrive/MyDrive/Colab Notebooks/BigDataAssignment6'

/content/gdrive/MyDrive/Colab Notebooks/BigDataAssignment6


In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
def read_portfolio(portfolio_num, portfolio_dir='portfolios'):
    """Summarise one portfolio CSV file as a flat dictionary of attributes.

    Reads ``<portfolio_dir>/portfolio_<portfolio_num>.csv`` and computes:

    * the relative frequency of four 'Vehicle_Anti_Theft_Device' categories,
    * the mean and standard deviation (``np.std``, i.e. ``ddof=0``) of
      'Driver_Minimum_Age', 'Vehicle_Age_In_Years' and 'Annual_Premium',
    * the natural logarithm of the loss ratio, i.e. total 'Loss_Amount'
      divided by total 'Annual_Premium'.

    Parameters
    ----------
    portfolio_num : int or str
        Number used to build the file name; returned as a string under 'ID'.
    portfolio_dir : str, optional
        Directory that holds the portfolio files. Defaults to 'portfolios'
        (relative to the current working directory).

    Returns
    -------
    dict
        Keys, in order: 'ID', 'VATD_NA', 'VATD_Passive', 'VATD_Alarm',
        'VATD_Active', 'DMA_mean', 'DMA_std', 'VAY_mean', 'VAY_std',
        'AP_mean', 'AP_std', 'ln_LR'.

    Raises
    ------
    ZeroDivisionError
        If the file contains a header but no data rows.
    """
    # Load the data into a Pandas data frame
    filename = os.path.join(portfolio_dir, 'portfolio_' + str(portfolio_num) + '.csv')
    portfolio = pd.read_csv(filename)

    summary = {'ID': str(portfolio_num)}

    # Relative frequency of each 'Vehicle_Anti_Theft_Device' category:
    # number of matching rows divided by the total number of rows.
    n_rows = portfolio.shape[0]
    device = portfolio['Vehicle_Anti_Theft_Device']
    device_categories = (
        ('VATD_NA', 'Not Applicable'),
        ('VATD_Passive', 'Passive Disabling-Vehicle Recovery'),
        ('VATD_Alarm', 'Alarm Only'),
        ('VATD_Active', 'Active Disabling'),
    )
    for key, category in device_categories:
        summary[key] = int((device == category).sum()) / n_rows

    # Mean and standard deviation of the numeric attributes
    numeric_attributes = (
        ('DMA', 'Driver_Minimum_Age'),
        ('VAY', 'Vehicle_Age_In_Years'),
        ('AP', 'Annual_Premium'),
    )
    for prefix, column in numeric_attributes:
        summary[prefix + '_mean'] = np.mean(portfolio[column])
        summary[prefix + '_std'] = np.std(portfolio[column])

    # Natural logarithm of the loss ratio (total losses / total premium).
    # A portfolio with zero total loss yields -inf here.
    summary['ln_LR'] = np.log(np.sum(portfolio['Loss_Amount']) / np.sum(portfolio['Annual_Premium']))

    return summary

**Create an empty data frame to collect the summary attributes of all the portfolios.**

In [None]:
# Column layout of the summary table: the portfolio identifier, the
# anti-theft device frequencies, the mean/std pairs, and the log loss ratio.
summary_columns = ['ID']
summary_columns += ['VATD_NA', 'VATD_Passive', 'VATD_Alarm', 'VATD_Active']
summary_columns += ['DMA_mean', 'DMA_std', 'VAY_mean', 'VAY_std', 'AP_mean', 'AP_std']
summary_columns += ['ln_LR']

# Start from an empty frame that carries only the column headers.
df_atr_summary = pd.DataFrame(columns=summary_columns)

**Write a for loop that goes from 1 to 500 and calls the `read_portfolio` function. Append the results of the function to the new data frame. (You might have to ignore the index here.)**

In [None]:
# Collect one summary dict per portfolio (1..500) and build the frame once.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending row by row copies the whole frame on every iteration.
portfolio_summaries = [read_portfolio(i) for i in range(1, 501)]

# Column order follows the key order of the dicts returned by read_portfolio;
# rows get a fresh 0..n-1 index, as ignore_index=True did before.
df_atr_summary = pd.DataFrame(portfolio_summaries)

In [None]:
# Use the portfolio ID as the row index instead of a regular column.
df_atr_summary = df_atr_summary.set_index('ID')

**Export this data into a file called summary_portfolios.csv**

In [None]:
# Write the summary table to disk; index=True emits the frame's index as the
# first CSV column.
summary_csv_path = 'summary_portfolios.csv'
df_atr_summary.to_csv(summary_csv_path, index=True)