In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
# Lift the row limit so displayed frames are never truncated.
pd.options.display.max_rows = None
# Source table; later cells use its form_date, stage and gross_amount columns.
data = pd.read_csv('102320_duplicates.csv')


In [23]:
# form_date has to be parsed to datetime before the .dt accessor can be used
# ("Can only use .dt accessor with datetimelike values"); once parsed, only
# the calendar date is kept.
data['form_date'] = pd.to_datetime(data['form_date']).dt.date
# "count": number of rows (projects) per form_date, largest first.
count = (
    data
    .groupby(['form_date'])
    .size()
    .sort_values(ascending=False)
)

In [24]:
# Series -> DataFrame, move form_date out of the index into a column, and
# give the unnamed value column (labelled 0) the name "count". Missing
# counts, if any, are replaced with 0.
count = (
    pd.DataFrame(count)
    .reset_index()
    .rename(columns={0: "count"})
    .assign(count=lambda frame: frame['count'].fillna(0))
)

# Preview the first rows.
count.head()

Unnamed: 0,form_date,count
0,2020-06-23,127
1,2020-05-20,119
2,2020-10-22,115
3,2020-06-10,112
4,2020-06-29,109


In [25]:
# "rev": total gross_amount per form_date, restricted to rows whose
# stage is "Closed - Won".
#
# NaNs are filled BEFORE the integer cast: astype(np.int64) raises on NaN,
# and an int64 column can never contain NaN afterwards, so a fillna placed
# after the cast could never have any effect.
rev = (
    data[data['stage'] == 'Closed - Won']
    .groupby('form_date')['gross_amount']
    .sum()
    .fillna(0)
    .astype(np.int64)
    # form_date moves from the index into a regular column.
    .reset_index()
)

# Preview the first rows.
rev.head()

Unnamed: 0,form_date,gross_amount
0,2020-02-18,49500
1,2020-02-19,38456
2,2020-02-20,158600
3,2020-02-21,12340
4,2020-02-24,66215


In [27]:
# Name the two daily tables after their side of the join.
left, right = count, rev
# Left join on form_date keeps every row of the count table; dates that have
# no entry in rev come back with NaN gross_amount, which is set to 0.
df = (
    pd.merge(left, right, how='left', on='form_date')
    .fillna({'gross_amount': 0})
)
df

Unnamed: 0,form_date,count,gross_amount
0,2020-06-23,127,684180.0
1,2020-05-20,119,431375.0
2,2020-10-22,115,801439.0
3,2020-06-10,112,468586.0
4,2020-06-29,109,651787.0
5,2020-06-30,107,458465.0
6,2020-06-25,107,973934.0
7,2020-06-24,105,375569.0
8,2020-04-28,102,1325582.0
9,2020-04-29,102,370377.0


In [28]:
# Feature: daily count as a single-column 2-D array (n_rows, 1).
x = df['count'].to_numpy()[:, np.newaxis]
# Target: daily gross_amount as a 1-D array.
y = df.loc[:, 'gross_amount'].to_numpy()

In [29]:
# Unfitted linear-regression estimator with default settings.
model = LinearRegression()

In [30]:
# Fit gross_amount (y) against daily count (x); the value of this expression
# is what the cell displays as its output.
model.fit(x, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
# Rebind `model` to a brand-new estimator fitted on the same x and y.
# NOTE(review): this repeats the construct-then-fit steps of the two
# preceding cells; one of the two variants is probably redundant.
model = LinearRegression()
model = model.fit(x, y)


In [32]:
# Score the fitted model on the same x and y it was trained on; the value
# is reported below as the coefficient of determination.
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.3728110517027784


In [33]:
# Intercept of the fitted line (the model's output for a count of 0).
print('intercept:', model.intercept_)

intercept: -5549.810724437004


In [34]:
# Fitted coefficient for the single feature (count); coef_ is an array,
# so it prints in brackets.
print('slope:', model.coef_)

slope: [7009.0170644]


In [40]:
# Two daily counts to predict for, laid out as a (2, 1) column so each
# row is one sample with one feature.
obj = np.array([[46], [120]])

In [41]:
# Model output for each row of obj (one predicted value per count).
y_pred = model.predict(obj)

In [42]:
# sep='\n' puts the label and the prediction array on separate lines.
print('predicted response:', y_pred, sep='\n')

predicted response:
[316864.97423814 835532.23700401]
