# Synthetic-data-generator



In this section, you will generate synthetic data that will be used to train the linear learner models. Each generated record consists of six numerically encoded features - the customer's age, gender, and location, the monthly charges for their service, their contract type, and the number of support calls they have made - plus a binary churn label that indicates whether the customer has left the contract or not.

In [48]:
import numpy as np
import pandas as pd
import json
import datetime
import time
import boto3
import sagemaker
import os

from time import gmtime, strftime
from random import choice

from sklearn.model_selection import train_test_split

In [49]:
# Number of synthetic customer records generated for each provider.
NUM_CUSTOMERS_PER_PROVIDERS = 1000

# Providers for which a separate dataset is generated and saved.
PROVIDERS = [
    'AnyCompany',
    'Advanced_Tenant1',
    'Advanced_Tenant2',
    'Premium_Tenant1',
    'Premium_Tenant2',
]

# Seed NumPy's global random generator so that "Restart & Run All" reproduces
# the same synthetic data. scikit-learn's train_test_split also draws from this
# global generator when no random_state is passed, so the splits become
# reproducible as well.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [50]:
def gen_yes_no():
    """Return a random binary flag (0 or 1) for a two-valued categorical feature.

    The value is drawn from NumPy's global random generator; the upper bound
    passed to ``randint`` is exclusive, so only 0 and 1 can be produced.
    """
    return np.random.randint(0, 2)

In [51]:
def gen_random_customer():
    """Generate one row of synthetic customer churn data.

    All values are drawn from NumPy's global random generator. Note that
    ``np.random.randint(low, high)`` excludes ``high``, so each upper bound
    below is one past the largest value that can be generated.

    Returns:
        list: Seven integers in the order
        ``[Churn, Age, Gender, Location, MonthlyCharges, ContractType, SupportCalls]``.
    """
    customer = {
        # Binary churn label (0 or 1).
        'Churn': gen_yes_no(),
        # Age in years, 18..84.
        'Age': np.random.randint(18, 85),
        # 0: Male, 1: Female, 2: Others. The upper bound must be 3 so that
        # code 2 can actually be generated.
        'Gender': np.random.randint(0, 3),
        # 0: New York, 1: Chicago, 2: Los Angeles, 3: Boston, 4: Seattle,
        # 5: Austin, 6: Las Vegas, 7: Houston, 8: San Diego, 9: Philadelphia,
        # 10: San Francisco. The upper bound must be 11 so that code 10 can
        # actually be generated.
        'Location': np.random.randint(0, 11),
        # Monthly charge, 40..139.
        'MonthlyCharges': np.random.randint(40, 140),
        # 0: 1-year, 1: Month-to-Month, 2: 2-year.
        'ContractType': np.random.randint(0, 3),
        # Number of support calls, 0..9.
        'SupportCalls': np.random.randint(0, 10),
    }

    # Emit the values in the column order expected by the dataset builder.
    return [
        customer['Churn'],
        customer['Age'],
        customer['Gender'],
        customer['Location'],
        customer['MonthlyCharges'],
        customer['ContractType'],
        customer['SupportCalls'],
    ]

In [52]:
def gen_customers(num_customers):
    """Build a customer churn dataset with ``num_customers`` random rows.

    Args:
        num_customers: Number of rows to generate.

    Returns:
        pandas.DataFrame: One row per generated customer, with the columns
        CHURN, AGE, GENDER, LOCATION, MONTHLY_CHARGE, CONTRACT_TYPE and
        SUPPORT_CALLS, in that order.
    """
    column_names = [
        'CHURN',
        'AGE',
        'GENDER',
        'LOCATION',
        'MONTHLY_CHARGE',
        'CONTRACT_TYPE',
        'SUPPORT_CALLS',
    ]
    rows = [gen_random_customer() for _ in range(num_customers)]
    return pd.DataFrame(rows, columns=column_names)

In [53]:
def save_data_locally(provider, train, val, test):
    """Write a provider's train, validation and test splits to local CSV files.

    Each split is written without header or index column to
    ``data/<provider>/<split>/<split>.csv``; the directories are created if
    they do not exist yet.

    Args:
        provider: Provider name, used as the sub-directory under ``data/``.
        train: DataFrame holding the training split.
        val: DataFrame holding the validation split.
        test: DataFrame holding the test split.
    """
    # (frame, directory template, file template) for each split, in write order.
    targets = (
        (train, 'data/{0}/train', 'data/{0}/train/train.csv'),
        (val, 'data/{0}/validation', 'data/{0}/validation/validation.csv'),
        (test, 'data/{0}/test', 'data/{0}/test/test.csv'),
    )
    for frame, dir_template, file_template in targets:
        os.makedirs(dir_template.format(provider), exist_ok=True)
        frame.to_csv(file_template.format(provider), sep=',', header=False, index=False)
    
    

In [54]:
#Generate, split and save a customer churn dataset for each provider.

for provider in PROVIDERS:
    customers = gen_customers(NUM_CUSTOMERS_PER_PROVIDERS)
    
    #First hold out 10% of the rows as the test set, then hold out 10% of the
    #remaining rows as the validation set (roughly 81% train / 9% validation / 10% test overall).
    train_val, test = train_test_split(customers, test_size=0.1)
    train, val = train_test_split(train_val, test_size=0.1)
    
    #Write the three splits as CSV files under data/<provider>/.
    save_data_locally(provider, train, val, test)


In [55]:
#Shows the first few rows of `customers`. After the generation loop this variable
#holds only the dataset of the last provider in PROVIDERS.
customers.head()

Unnamed: 0,CHURN,AGE,GENDER,LOCATION,MONTHLY_CHARGE,CONTRACT_TYPE,SUPPORT_CALLS
0,1,45,1,3,109,2,3
1,1,51,1,0,134,2,3
2,0,41,0,3,59,0,8
3,0,19,0,0,83,2,1
4,0,60,1,1,50,2,1
