In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import style
import matplotlib.pyplot as plt
import random
import torch
from torch.autograd import Variable
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from sklearn.model_selection import GridSearchCV
# Switch matplotlib to its "ggplot" style sheet so figures drawn below share one look.
style.use("ggplot")

def plot(train_rolling_loss,dev_rolling_loss,train_rolling_acc,dev_rolling_acc):
    """Show rolling loss and rolling accuracy curves in two side-by-side panels.

    Each panel overlays a "training" curve and a "dev" curve. The panels sit
    in the top row of a 2x2 subplot grid (positions 221 and 222) on a single
    15x10 figure, which is displayed before the function returns.

    Args:
        train_rolling_loss: values drawn as "training" in the loss panel.
        dev_rolling_loss: values drawn as "dev" in the loss panel.
        train_rolling_acc: values drawn as "training" in the accuracy panel.
        dev_rolling_acc: values drawn as "dev" in the accuracy panel.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    # One row per panel:
    # (subplot position, title, training values, dev values, legend location code)
    # Legend codes follow matplotlib's numbering: 1 = upper right, 0 = best.
    panels = (
        (221, "Rolling Loss", train_rolling_loss, dev_rolling_loss, 1),
        (222, "Rolling Accuracy", train_rolling_acc, dev_rolling_acc, 0),
    )

    plt.figure(figsize=(15, 10))
    for position, heading, train_values, dev_values, legend_loc in panels:
        plt.subplot(position)
        plt.plot(train_values, label="training")
        plt.plot(dev_values, label="dev")
        plt.title(heading, loc="center")
        plt.legend(loc=legend_loc)
    plt.show()

In [6]:
## Read the cleaned dataset into `data` and preview its first rows.
## NOTE: the CSV path is relative, so it resolves against the kernel's working directory.
data = pd.read_csv('cleaned_6_Sep.csv')
data.head()

Unnamed: 0,Id,StageName,Status_Reason__c,RecordType.Name,RICE_Supported__c,CreatedDate,CloseDate,Actual_Close_Date__c,AccountId,Lead_Faculty__c,...,Industry,Industry_Sub_Type__c,Business_Type__c,Is_External__c,ParentId,CloseYear,CloseMonth,CreatedYear,CreatedMonth,Time_length
0,0062e000002Hc2PAAS,0,Customer No Longer Interested,Custom Education (MSPACE Included),RIC-BD&I,2019-11-07T00:59:46.000Z,8/31/2020,1,0012e000003AqeVAAS,0012e000002ZGfbAAG,...,Health,Health Care & Healthy Aging,Multinational / Other Large Corporate,External,0,2020,8,2019,11,1
1,0062e000002HFaaAAG,1,Post Award,Parent Grant,RIC-RE&D,2018-09-19T04:32:55.000Z,12/31/2049,1,0012e000003A6ElAAK,NotGiven,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2049,12,2018,9,31
2,0062e000002HFabAAG,1,Post Award,Parent Grant,RIC-RE&D,2018-09-19T04:32:19.000Z,6/22/2020,1,0012e000003A6ElAAK,NotGiven,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2020,6,2018,9,2
3,0062e000002HFaiAAG,1,Post Award,Parent Grant,Not supported,2019-02-13T19:41:22.000Z,6/16/2020,1,0012e000003A6bnAAC,NotGiven,...,Health,Health Care & Healthy Aging,Government: Australia: Federal,External,0,2020,6,2019,2,1
4,0062e000002HFalAAG,1,Post Award,Parent Grant,RIC-RE&D,2017-04-07T01:27:08.000Z,6/22/2020,1,0012e000003A6osAAC,NotGiven,...,Public Administration,Public Administration,Government: Australia: Federal,External,1,2020,6,2017,4,3


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `data`.
data.describe()

Unnamed: 0,StageName,Actual_Close_Date__c,Parent_Opportunity__c,ParentId,CloseYear,CloseMonth,CreatedYear,CreatedMonth,Time_length
count,5516.0,5516.0,5516.0,5516.0,5516.0,5516.0,5516.0,5516.0,5516.0
mean,0.385968,1.0,0.111675,0.166062,2018.988397,8.508883,2017.892857,6.304569,1.09554
std,0.486867,0.0,0.314995,0.37217,1.243314,3.297367,1.322768,3.183033,1.364001
min,0.0,1.0,0.0,0.0,2015.0,1.0,2015.0,1.0,-1.0
25%,0.0,1.0,0.0,0.0,2019.0,6.0,2017.0,4.0,0.0
50%,0.0,1.0,0.0,0.0,2019.0,11.0,2018.0,6.0,1.0
75%,1.0,1.0,0.0,0.0,2019.0,11.0,2019.0,9.0,2.0
max,1.0,1.0,1.0,1.0,2050.0,12.0,2020.0,12.0,31.0


In [16]:
# Number of columns in the loaded frame.
len(data.columns)

23

In [11]:
## Build the model input frame `df`:
##   pass-through columns + rescaled date parts + one-hot blocks for 11 categorical columns.

def _dummies(frame, column, prefix):
    """Return the indicator (one-hot) columns for ``frame[column]`` using ``prefix``."""
    return pd.get_dummies(frame[column], prefix=prefix)


def _scaled(frame, column, divisor):
    """Return ``frame[column]`` divided element-wise by the constant ``divisor``."""
    return frame[column].div(divisor)


# --- one-hot blocks (11 categorical columns) ---------------------------------
a = _dummies(data, "Status_Reason__c", "Status_Reason")
b = _dummies(data, "RecordType.Name", "RecordType")
c = _dummies(data, "RICE_Supported__c", "RICE_Supported")
d = _dummies(data, "AccountId", "AccountId")
e = _dummies(data, "Lead_Faculty__c", "Lead_Faculty")
e1 = _dummies(data, "Lead_School__c", "Lead_School")
f = _dummies(data, "RecordType.Name.1", "RecordType_ind")
g = _dummies(data, "Industry", "Industry")
g1 = _dummies(data, "Industry_Sub_Type__c", "Industry_Sub_Type")
h = _dummies(data, "Business_Type__c", "Business_Type")
# "RecordType" (prefix 'RecordType_mixed', formerly `i`) is not encoded; its line was disabled.
j = _dummies(data, "Is_External__c", "Is_External")

# --- columns carried over unchanged ------------------------------------------
passthrough_columns = [
    "StageName",
    "Actual_Close_Date__c",
    "Parent_Opportunity__c",
    "ParentId",
]
org = data[passthrough_columns]

# --- date parts divided by fixed constants -----------------------------------
# The divisors equal the column maxima shown in this notebook's data.describe()
# output (2050, 12, 2020, 12, 31). They are hard-coded, so re-check them if the
# CSV changes. Time_length has a minimum of -1 there, so scale5 can be negative.
scale1 = _scaled(data, "CloseYear", 2050)
scale2 = _scaled(data, "CloseMonth", 12)
scale3 = _scaled(data, "CreatedYear", 2020)
scale4 = _scaled(data, "CreatedMonth", 12)
# NOTE(review): the key below starts with a space and is kept byte-for-byte;
# presumably it matches the header stored in the CSV - confirm before "fixing" it.
scale5 = _scaled(data, " Time_length", 31)

# --- assemble ----------------------------------------------------------------
# Order matters: with ignore_index=True on axis=1 the column names are dropped
# and replaced by 0..n-1, so columns 0-3 are `org`, 4-8 are scale1..scale5 and
# the one-hot blocks follow from column 9 onward.
feature_blocks = [
    org,
    scale1, scale2, scale3, scale4, scale5,
    a, b, c, d, e, e1, f, g, g1, h, j,
]
df = pd.concat(feature_blocks, axis=1, ignore_index=True)

In [15]:
# Display the assembled frame; its columns are integer-labelled because the concat used ignore_index=True.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2263,2264,2265,2266,2267,2268,2269,2270,2271,2272
0,0,1,0,0,0.985366,0.666667,0.999505,0.916667,0.032258,0,...,0,0,1,0,0,0,0,0,1,0
1,1,1,0,1,0.999512,1.000000,0.999010,0.750000,1.000000,0,...,0,0,0,0,0,0,0,0,1,0
2,1,1,0,1,0.985366,0.500000,0.999010,0.750000,0.064516,0,...,0,0,0,0,0,0,0,0,1,0
3,1,1,0,0,0.985366,0.500000,0.999505,0.166667,0.032258,0,...,0,0,0,0,0,0,0,0,1,0
4,1,1,0,1,0.985366,0.500000,0.998515,0.333333,0.096774,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5511,1,1,1,1,0.985366,1.000000,1.000000,0.583333,0.000000,0,...,0,0,0,0,0,0,0,0,1,0
5512,1,1,0,0,0.985366,0.583333,1.000000,0.583333,0.000000,0,...,0,0,0,0,0,0,0,1,1,0
5513,1,1,0,0,0.985366,0.583333,1.000000,0.583333,0.000000,0,...,0,0,0,0,0,0,0,1,1,0
5514,1,1,0,0,0.985854,0.083333,1.000000,0.666667,0.032258,0,...,0,0,0,0,0,0,0,0,1,0
