In [1]:
import pandas as pd

df = pd.read_csv(r'C:\Users\AARUSHI TANDON\Downloads\Shark Tank US dataset.csv')
df.head()


Unnamed: 0,Startup Name,Industry,Business Description,Original Ask Amount,Got Deal,Total Deal Amount
0,AvaTheElephant,Health/Wellness,Ava The Elephant - Baby and Child Care,50000.0,1.0,50000.0
1,MrTod'sPieFactory,Food and Beverage,Mr. Tod's Pie Factory - Specialty Food,460000.0,1.0,460000.0
2,Wispots,Business Services,Wispots - Consumer Services,1200000.0,0.0,
3,CollegeFoxesPackingBoxes,Lifestyle/Home,College Foxes Packing Boxes - Consumer Services,250000.0,0.0,
4,IonicEar,Technology/Software,Ionic Ear - Novelties,1000000.0,0.0,


In [10]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    return text

df['text'] = df['Business Description'].astype(str)  # just using one column for now



In [11]:
# Drop rows where 'Business Description' or 'Got Deal' is NaN
df = df.dropna(subset=['Business Description', 'Got Deal'])


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)  # limit vocab size for speed
X = vectorizer.fit_transform(df['text'])


In [13]:
y = df['Got Deal']


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.5868055555555556
              precision    recall  f1-score   support

         0.0       0.43      0.11      0.18       115
         1.0       0.60      0.90      0.72       173

    accuracy                           0.59       288
   macro avg       0.52      0.51      0.45       288
weighted avg       0.54      0.59      0.51       288



In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

df['text'] = df['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to C:\Users\AARUSHI
[nltk_data]     TANDON\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\AARUSHI
[nltk_data]     TANDON\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(clean_text)


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['text'])


In [18]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)


In [19]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.53125
              precision    recall  f1-score   support

         0.0       0.40      0.37      0.38       115
         1.0       0.60      0.64      0.62       173

    accuracy                           0.53       288
   macro avg       0.50      0.50      0.50       288
weighted avg       0.52      0.53      0.53       288



In [21]:
print(df['Got Deal'].value_counts(normalize=True))


Got Deal
1.0    0.611691
0.0    0.388309
Name: proportion, dtype: float64


In [22]:
vectorizer = TfidfVectorizer(max_features=1000)


In [35]:
for c in [0.001, 0.1, 1, 10, 100]:
    model = LogisticRegression(max_iter=1000, C=c)
    model.fit(X_train, y_train)
    print(f"C={c} -> Accuracy: {model.score(X_test, y_test)}")


C=1e-05 -> Accuracy: 0.6111111111111112
C=0.1 -> Accuracy: 0.6111111111111112
C=1 -> Accuracy: 0.6076388888888888
C=10 -> Accuracy: 0.53125
C=100 -> Accuracy: 0.5277777777777778


In [24]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.40      0.37      0.38       115
         1.0       0.60      0.64      0.62       173

    accuracy                           0.53       288
   macro avg       0.50      0.50      0.50       288
weighted avg       0.52      0.53      0.53       288



In [25]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
model.fit(X_train_res, y_train_res)


In [26]:
probs = model.predict_proba(X_test)[:,1]
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, probs)
# plot or pick best threshold


In [36]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

# Drop NaNs
df = df.dropna(subset=['Business Description', 'Got Deal', 'Original Ask Amount', 'Industry'])

# Clean text function as before
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    return text

df['text'] = df['Business Description'].astype(str).apply(clean_text)

# Features and target
X = df[['text', 'Original Ask Amount', 'Industry']]
y = df['Got Deal']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=1000), 'text'),
        ('industry', OneHotEncoder(handle_unknown='ignore'), ['Industry']),
    ],
    remainder='passthrough'  # pass 'Original Ask Amount' as is
)

# Logistic regression pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, C=0.01))
])

# Train
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.625
              precision    recall  f1-score   support

         0.0       1.00      0.04      0.07       112
         1.0       0.62      1.00      0.77       176

    accuracy                           0.62       288
   macro avg       0.81      0.52      0.42       288
weighted avg       0.77      0.62      0.49       288



In [None]:
import os
print(os.getcwd())


['C:/Users/AARUSHI TANDON/Documents/startup_pitch_model.pkl']

In [1]:
import joblib
import os

path = os.getcwd()  # current working directory
filename = os.path.join(path, 'startup_pitch_model.pkl')

joblib.dump(pipeline, filename)
print(f"Model saved at: {filename}")


NameError: name 'pipeline' is not defined

Current directory: C:\Users\AARUSHI TANDON\Desktop
Attempted to save file at: C:\Users\AARUSHI TANDON\Desktop\test_save.pkl
File exists? True
