In [33]:
import numpy as np
import sklearn 
import pandas as pd
import xgboost as xg
import altair as alt
import nbformat

In [2]:
sharktank = pd.read_csv('Shark Tank US dataset.csv',  encoding= 'unicode_escape')

In [3]:
sharktank['Industry'].value_counts()

Food and Beverage              202
Fashion / Beauty               167
Lifestyle / Home               140
Children / Education            93
Fitness / Sports / Outdoors     93
Software / Tech                 60
Pet Products                    43
Health / Wellness               36
Business Services               29
Media / Entertainment           23
Lifestyle/Home                  19
Automotive                      12
Health/Wellness                 12
Green/CleanTech                 11
Travel                          11
Fashion/Beauty                  11
Uncertain / Other                9
Children/Education               9
Fitness/Sports/Outdoors          8
Software/Tech                    3
Name: Industry, dtype: int64

In [4]:
list(sharktank.columns)

['Season Number',
 'Episode Number',
 'Pitch Number',
 'Startup Name',
 'Industry',
 'Business Description',
 'Pitchers Gender',
 'Pitchers State',
 'Pitchers City',
 'Entrepreneur Names',
 'Company Website',
 'Multiple Entrepreneurs',
 'Original Ask Amount',
 'Original Offered Equity',
 'Valuation Requested',
 'Got Deal',
 'Total Deal Amount',
 'Total Deal Equity',
 'Deal Valuation',
 'Number of sharks in deal',
 'Investment Amount Per Shark',
 'Royalty Deal',
 'Loan',
 'Barbara Corcoran Investment Amount',
 'Mark Cuban Investment Amount',
 'Lori Greiner Investment Amount',
 'Robert Herjavec Investment Amount',
 'Daymond John Investment Amount',
 'Kevin O Leary Investment Amount',
 'Guest Investment Amount',
 'BarbaraCorcoran Present',
 'MarkCuban Present',
 'LoriGreiner Present',
 'RobertHerjavec Present',
 'DaymondJohn Present',
 'KevinOLeary Present',
 'KevinHarrington Present',
 'Guest Name',
 'Notes']

In [5]:
kevin = sharktank.loc[sharktank['KevinOLeary Present'] == 1]
barbara = sharktank.loc[sharktank['BarbaraCorcoran Present'] == 1]
robert = sharktank.loc[sharktank['RobertHerjavec Present'] == 1]
daymond = sharktank.loc[sharktank['DaymondJohn Present'] == 1]
kevinharrington = sharktank.loc[sharktank['KevinHarrington Present'] == 1]
lori = sharktank.loc[sharktank['LoriGreiner Present'] == 1]
markcuban = sharktank.loc[sharktank['MarkCuban Present'] == 1]

In [25]:

alt.renderers.enable('default')
chart = alt.Chart(sharktank).mark_bar().encode(
    x = 'Industry:O',
    y = 'count(Industry):Q'
).show()



Displaying chart at http://localhost:51995/


KeyboardInterrupt: 

In [26]:
sharktank['Pitchers Gender'].value_counts()

Male          584
Female        245
Mixed Team    162
Name: Pitchers Gender, dtype: int64

In [30]:
gotdeal = sharktank['Got Deal'].value_counts()

In [31]:
chart = alt.Chart(sharktank).mark_bar().encode(
    x = 'Got Deal:O',
    y = 'count(Got Deal):Q'
).show()

Displaying chart at http://localhost:51995/


KeyboardInterrupt: 

In [47]:


sharktank['Season Number'] = sharktank['Season Number'].astype(pd.Int32Dtype())
sharktank['Episode Number'] = sharktank['Episode Number'].astype(pd.Int32Dtype())
sharktank['Pitch Number'] = sharktank['Pitch Number'].astype(pd.Int32Dtype())

sharktank['Startup Name'] = sharktank['Startup Name'].astype(str)
sharktank['Industry'] = sharktank['Industry'].astype(str)
sharktank['Business Description'] = sharktank['Business Description'].astype(str)

sharktank['Multiple Entrepreneurs'] = sharktank['Multiple Entrepreneurs'].astype(pd.Int32Dtype())



In [37]:
# Calculate correlation matrix
corr = sharktank.corr()

# Reshape correlation matrix into tidy format
corr = corr.stack().reset_index()
corr.columns = ['var1', 'var2', 'corr']

# Create correlation matrix plot using Altair
chart = alt.Chart(corr).mark_rect().encode(
    x='var1:N',
    y='var2:N',
    color='corr:Q'
)

# Add text to the correlation matrix plot
text = chart.mark_text(baseline='middle').encode(
    text=alt.Text('corr:Q', format='.2f'),
    color=alt.condition(
        alt.datum.corr > 0.5,
        alt.value('white'),
        alt.value('black')
    )
)

(chart + text).configure_view(stroke=None)

chart





In [58]:
linear_sharktank = sharktank.drop(['Got Deal', 'Startup Name',
 'Industry',
 'Business Description',
 'Pitchers Gender',
 'Pitchers State',
 'Pitchers City',
 'Entrepreneur Names',
 'Company Website','Notes','Guest Name'], axis=1)

linear_sharktank = linear_sharktank.replace(',','', regex=True)
linear_sharktank = linear_sharktank.astype(float)

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
X = linear_sharktank
y = sharktank['Got Deal']
imputer = SimpleImputer()
X = imputer.fit_transform(X)

X_train, X_test,  y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [64]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [67]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
lr.fit(X_train, y_train)

pred = lr.predict(X_test)


print('Mean absolute error:', mean_absolute_error(y_test, pred))
print('MSE:', mean_squared_error(y_test, pred))
print('sqrt MSE:', np.sqrt(mean_squared_error(y_test, pred)))


Mean absolute error: 0.4916767691422439
MSE: 0.26998585150854787
sqrt MSE: 0.5196016277000562
