In [8]:
import gradio as gr
import pandas as pd
import joblib

# 1. üì¶ SMOTE Logistic Regression .pkl —Ñ–∞–π–ª—É—É–¥—ã–≥ –∞—á–∞–∞–ª–∞—Ö
model = joblib.load("startup_model_logreg_smote.pkl")
scaler = joblib.load("scaler_logreg_smote.pkl")
label_encoders = joblib.load("label_encoders_logreg_smote.pkl")
feature_order = joblib.load("feature_order_logreg_smote.pkl")

# 2. üß† –¢–∞–∞–º–∞–≥ –≥–∞—Ä–≥–∞—Ö —Ñ—É–Ω–∫—Ü
def predict(state, category, funding, relationships, milestones, participants, vc, angel, top500):
    features = {col: 0 for col in feature_order}

    # –ú—É–∂
    if state in label_encoders['state_code'].classes_:
        features['state_code'] = label_encoders['state_code'].transform([state])[0]
    features[f"is_{state}" if f"is_{state}" in feature_order else "is_otherstate"] = 1

    # –°–∞–ª–±–∞—Ä
    if category in label_encoders['category_code'].classes_:
        features['category_code'] = label_encoders['category_code'].transform([category])[0]
    features[f"is_{category}" if f"is_{category}" in feature_order else "is_othercategory"] = 1

    # –¢–æ–æ–Ω –±–æ–ª–æ–Ω –ª–æ–≥–∏–∫ —Ç–∞–ª–±–∞—Ä—É—É–¥
    features['funding_total_usd'] = funding * 1_000_000
    features['relationships'] = relationships
    features['milestones'] = milestones
    features['avg_participants'] = participants
    features['has_VC'] = int(vc)
    features['has_angel'] = int(angel)
    features['is_top500'] = int(top500)

    # Default-required
    features['closed_at'] = 0
    features['status'] = 0

    df = pd.DataFrame([features])[feature_order]
    df_scaled = scaler.transform(df)
    prob = model.predict_proba(df_scaled)[0]
    pred = model.predict(df_scaled)[0]

    # ‚úÖ –ó”©–≤–ª”©–º–∂—Ç—ç–π —Ö–∞—Ä–∏—É –±—É—Ü–∞–∞—Ö
    if pred == 1:
        msg = f"‚úÖ –¢–∞–Ω–∞–π —Å—Ç–∞—Ä—Ç–∞–ø –∞–º–∂–∏–ª—Ç—Ç–∞–π –±–æ–ª–æ—Ö –º–∞–≥–∞–¥–ª–∞–ª: {prob[1]:.2%}.\n\nüöÄ –ó”©–≤–ª”©–º–∂: –ò–ª“Ø“Ø milestone —Ö—ç—Ä—ç–≥–∂“Ø“Ø–ª–∂, —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–ª—Ç–∞–∞ —Ç–æ–≥—Ç–≤–æ—Ä—Ç–æ–π –Ω—ç–º—ç–≥–¥“Ø“Ø–ª—ç—ç—Ä—ç–π."
    else:
        msg = f"‚ùå –¢–∞–Ω–∞–π —Å—Ç–∞—Ä—Ç–∞–ø –∞–º–∂–∏–ª—Ç–≥“Ø–π –±–æ–ª–æ—Ö –º–∞–≥–∞–¥–ª–∞–ª: {prob[0]:.2%}.\n\nüìâ –ó”©–≤–ª”©–º–∂: VC —ç—Å–≤—ç–ª angel —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á —Ç–∞—Ç–∞—Ö, –±–∞–≥–∏–π–Ω –±“Ø—Ç—Ü–∏–π–≥ –±—ç—Ö–∂“Ø“Ø–ª—ç—Ö —à–∞–∞—Ä–¥–ª–∞–≥–∞—Ç–∞–π –±–∞–π–∂ –º–∞–≥–∞–¥–≥“Ø–π."

    return msg

# 3. üåê Gradio chatbot-style interface
with gr.Blocks(title="Startup Chatbot (LogReg + SMOTE)") as chatbot:
    gr.Markdown("## ü§ñ –°—Ç–∞—Ä—Ç–∞–ø –ê–º–∂–∏–ª—Ç—ã–Ω –¢–∞–∞–º–∞–≥–ª–∞–≥—á –ß–∞—Ç–±–æ—Ç (Logistic Regression + SMOTE)")
    gr.Markdown("–¢–∞ –∞—Å—É—É–ª—Ç–∞–¥ —Ö–∞—Ä–∏—É–ª–∞–∞–¥ ML –∑–∞–≥–≤–∞—Ä—ã–Ω —Ç–∞–∞–º–∞–≥ + –∑”©–≤–ª”©–º–∂”©”© –∞–≤–∞–∞—Ä–∞–π.")

    with gr.Row():
        state = gr.Dropdown(["CA", "NY", "TX", "MA", "otherstate"], label="1. –ê–ª—å –º—É–∂–∏–¥ –±–∞–π—Ä–ª–∞–¥–∞–≥ –≤—ç?")
        category = gr.Dropdown(["biotech", "software", "web", "othercategory"], label="2. –Ø–º–∞—Ä —Å–∞–ª–±–∞—Ä—Ç –∞–∂–∏–ª–ª–∞–¥–∞–≥ –≤—ç?")

    funding = gr.Slider(0, 100, step=1, label="3. –•”©—Ä”©–Ω–≥”© (—Å–∞—è USD)", value=1)
    relationships = gr.Slider(0, 20, step=1, label="4. Co-founder –±–æ–ª–æ–Ω —Ö–∞—Ä–∏–ª—Ü–∞–∞–Ω—ã —Ç–æ–æ", value=3)
    milestones = gr.Slider(0, 10, step=1, label="5. Milestone-–∏–π–Ω —Ç–æ–æ", value=2)
    participants = gr.Slider(0, 10, step=1, label="6. –î—É–Ω–¥–∞–∂ —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á–∏–π–Ω —Ç–æ–æ", value=4)

    vc = gr.Checkbox(label="7. VC —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á—Ç–∞–π —é—É?")
    angel = gr.Checkbox(label="8. Angel —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á—Ç–∞–π —é—É?")
    top500 = gr.Checkbox(label="9. Top 500-–¥ –±–∞–≥—Ç—Å–∞–Ω —É—É?")

    btn = gr.Button("üìä –¢–∞–∞–º–∞–≥ –≥–∞—Ä–≥–∞—Ö")
    result = gr.Textbox(label="üß† –¢–∞–∞–º–∞–≥ –±–∞ –ó”©–≤–ª”©–º–∂")

    btn.click(fn=predict, inputs=[
        state, category, funding, relationships,
        milestones, participants, vc, angel, top500
    ], outputs=result)

chatbot.launch(share=False)


Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
import joblib

# 1. üì• CSV –¥–∞—Ç–∞–≥ —É–Ω—à–∏—Ö
df = pd.read_csv("C:/Users/Dell/Downloads/Ecn325 data (1).csv")  # ‚Üê –ó–∞–º—ã–≥ ”©”©—Ä–∏–π–Ω –∫–æ–º–ø—å—é—Ç–µ—Ä—Ç —Ç–∞–∞—Ä—É—É–ª–Ω–∞ —É—É

# 2. üéØ Target —Ö”©—Ä–≤“Ø“Ø–ª—ç—Ö ('acquired'=1, 'closed'=0)
df = df[df['status'].isin(['acquired', 'closed'])].copy()
df['status'] = df['status'].map({'acquired': 1, 'closed': 0})

# 3. üóëÔ∏è –•—ç—Ä—ç–≥–≥“Ø–π –±–∞–≥–∞–Ω—É—É–¥—ã–≥ —É—Å—Ç–≥–∞—Ö
columns_to_drop = [
    'Unnamed: 0', 'Unnamed: 6', 'id', 'name',
    'object_id', 'state_code.1'
]
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# 4. üè∑Ô∏è –ö–∞—Ç–µ–≥–æ—Ä–∏ –±–∞–≥–∞–Ω—É—É–¥—ã–≥ Label Encode —Ö–∏–π—Ö
label_encoders = {}
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = df[col].fillna("unknown")
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 5. üìä –¢–æ–æ–Ω —É—Ç–≥—É—É–¥—ã–Ω —Ö–æ–æ—Å–æ–Ω —É—Ç–≥—ã–≥ –¥—É–Ω–¥–∂–∞–∞—Ä –Ω”©—Ö”©—Ö
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# 6. üßÆ Features –±–∞ Target-–≥ —Å–∞–ª–≥–∞—Ö
X = df.drop(columns=["status"])
y = df["status"]

# 7. ‚öñÔ∏è –°—Ç–∞–Ω–¥–∞—Ä—Ç–∞–¥ –æ—Ä—É—É–ª–∞—Ö
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 8. üß™ SMOTE –∞—à–∏–≥–ª–∞–∂ balance —Ö–∏–π—Ö
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# 9. ‚úÇÔ∏è Train-test split —Ö–∏–π—Ö
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# 10. üîÅ Logistic Regression —Å—É—Ä–≥–∞–ª—Ç
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# 11. üìà “Æ—Ä –¥“Ø–Ω
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"‚úÖ Accuracy: {acc:.2%}")
print(f"üìä F1-score: {f1:.2%}")

# 12. üíæ –§–∞–π–ª—É—É–¥—ã–≥ —Ö–∞–¥–≥–∞–ª–∞—Ö
joblib.dump(model, "startup_model_logreg_smote.pkl")
joblib.dump(scaler, "scaler_logreg_smote.pkl")
joblib.dump(label_encoders, "label_encoders_logreg_smote.pkl")
joblib.dump(X.columns.tolist(), "feature_order_logreg_smote.pkl")
print("‚úÖ –ë“Ø—Ö .pkl —Ñ–∞–π–ª—É—É–¥ SMOTE –∑–∞–≥–≤–∞—Ä–∞–∞—Ä “Ø“Ø—Å–≥—ç–≥–¥–ª—ç—ç.")


‚úÖ Accuracy: 77.82%
üìä F1-score: 77.25%
‚úÖ –ë“Ø—Ö .pkl —Ñ–∞–π–ª—É—É–¥ SMOTE –∑–∞–≥–≤–∞—Ä–∞–∞—Ä “Ø“Ø—Å–≥—ç–≥–¥–ª—ç—ç.




In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import joblib

# 1. CSV –¥–∞—Ç–∞–≥ —É–Ω—à–∏—Ö
df = pd.read_csv("C:/Users/Dell/Downloads/Ecn325 data (1).csv")  # ‚Üê –∑–∞–º–∞–∞ —Ç–∞–∞—Ä—É—É–ª–Ω–∞ —É—É

# 2. Target —Ö”©—Ä–≤“Ø“Ø–ª—ç—Ö ('acquired'=1, 'closed'=0)
df = df[df['status'].isin(['acquired', 'closed'])].copy()
df['status'] = df['status'].map({'acquired': 1, 'closed': 0})

# 3. –•—ç—Ä—ç–≥–≥“Ø–π –±–∞–≥–∞–Ω—É—É–¥—ã–≥ —Ö–∞—Å–∞—Ö
columns_to_drop = [
    'Unnamed: 0', 'Unnamed: 6', 'id', 'name',
    'object_id', 'state_code.1'
]
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# 4. –ö–∞—Ç–µ–≥–æ—Ä–∏ —Ç–∞–ª–±–∞—Ä—É—É–¥—ã–≥ Label Encode
label_encoders = {}
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = df[col].fillna("unknown")
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 5. –¢–æ–æ–Ω –±–∞–≥–∞–Ω—É—É–¥—ã–Ω —Ö–æ–æ—Å–æ–Ω —É—Ç–≥—ã–≥ –¥—É–Ω–¥–∂–∞–∞—Ä –Ω”©—Ö”©—Ö
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# 6. Features –±–æ–ª–æ–Ω Target-–≥ —Å–∞–ª–≥–∞—Ö
X = df.drop(columns=["status"])
y = df["status"]

# 7. –°—Ç–∞–Ω–¥–∞—Ä—Ç–∞–¥ –æ—Ä—É—É–ª–∞—Ö
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 8. Train-Test split —Ö–∏–π—Ö
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# 9. Logistic Regression model —Å—É—Ä–≥–∞–ª—Ç
model = LogisticRegression(
    class_weight='balanced',  # Imbalance-–≥ –∑–∞—Å–Ω–∞
    max_iter=1000,
    random_state=42
)
model.fit(X_train, y_train)

# 10. “Æ—Ä –¥“Ø–Ω
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"üéØ Accuracy: {acc:.2%}")
print(f"üìä F1-score: {f1:.2%}")

# 11. .pkl —Ñ–∞–π–ª—É—É–¥—ã–≥ —Ö–∞–¥–≥–∞–ª–∞—Ö
joblib.dump(model, "startup_model_logreg.pkl")
joblib.dump(scaler, "scaler_logreg.pkl")
joblib.dump(label_encoders, "label_encoders_logreg.pkl")
joblib.dump(X.columns.tolist(), "feature_order_logreg.pkl")
print("‚úÖ –ë“Ø—Ö .pkl —Ñ–∞–π–ª—É—É–¥ Logistic Regression —Ö—É–≤–∏–ª–±–∞—Ä–∞–∞—Ä “Ø“Ø—Å–≥—ç–≥–¥–ª—ç—ç.")


üéØ Accuracy: 68.65%
üìä F1-score: 72.90%
‚úÖ –ë“Ø—Ö .pkl —Ñ–∞–π–ª—É—É–¥ Logistic Regression —Ö—É–≤–∏–ª–±–∞—Ä–∞–∞—Ä “Ø“Ø—Å–≥—ç–≥–¥–ª—ç—ç.


In [11]:
import gradio as gr
import pandas as pd
import joblib

# 1. üì¶ Logistic –∑–∞–≥–≤–∞—Ä—ã–Ω .pkl —Ñ–∞–π–ª—É—É–¥—ã–≥ –∞—á–∞–∞–ª–∞—Ö
model = joblib.load("startup_model_logreg.pkl")
scaler = joblib.load("scaler_logreg.pkl")
label_encoders = joblib.load("label_encoders_logreg.pkl")
feature_order = joblib.load("feature_order_logreg.pkl")

# 2. üß† –¢–∞–∞–º–∞–≥ –≥–∞—Ä–≥–∞—Ö —Ñ—É–Ω–∫—Ü
def predict(state, category, funding, relationships, milestones, participants, vc, angel, top500):
    features = {col: 0 for col in feature_order}

    # –ú—É–∂
    if state in label_encoders['state_code'].classes_:
        features['state_code'] = label_encoders['state_code'].transform([state])[0]
    features[f"is_{state}" if f"is_{state}" in feature_order else "is_otherstate"] = 1

    # –°–∞–ª–±–∞—Ä
    if category in label_encoders['category_code'].classes_:
        features['category_code'] = label_encoders['category_code'].transform([category])[0]
    features[f"is_{category}" if f"is_{category}" in feature_order else "is_othercategory"] = 1

    # –¢–æ–æ–Ω –±–æ–ª–æ–Ω –ª–æ–≥–∏–∫ —Ç–∞–ª–±–∞—Ä—É—É–¥
    features['funding_total_usd'] = funding * 1_000_000
    features['relationships'] = relationships
    features['milestones'] = milestones
    features['avg_participants'] = participants
    features['has_VC'] = int(vc)
    features['has_angel'] = int(angel)
    features['is_top500'] = int(top500)

    # Default-required
    features['closed_at'] = 0
    features['status'] = 0

    df = pd.DataFrame([features])[feature_order]
    df_scaled = scaler.transform(df)
    prob = model.predict_proba(df_scaled)[0]
    pred = model.predict(df_scaled)[0]

    # ‚úÖ –ó”©–≤–ª”©–º–∂—Ç—ç–π —Ö–∞—Ä–∏—É –±—É—Ü–∞–∞—Ö
    if pred == 1:
        msg = f"‚úÖ –¢–∞–Ω–∞–π —Å—Ç–∞—Ä—Ç–∞–ø –∞–º–∂–∏–ª—Ç—Ç–∞–π –±–æ–ª–æ—Ö –º–∞–≥–∞–¥–ª–∞–ª: {prob[1]:.2%}.\n\nüöÄ –ó”©–≤–ª”©–º–∂: –ò–ª“Ø“Ø milestone —Ö—ç—Ä—ç–≥–∂“Ø“Ø–ª–∂, —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–ª—Ç–∞–∞ —Ç–æ–≥—Ç–≤–æ—Ä—Ç–æ–π –Ω—ç–º—ç–≥–¥“Ø“Ø–ª—ç—ç—Ä—ç–π."
    else:
        msg = f"‚ùå –¢–∞–Ω–∞–π —Å—Ç–∞—Ä—Ç–∞–ø –∞–º–∂–∏–ª—Ç–≥“Ø–π –±–æ–ª–æ—Ö –º–∞–≥–∞–¥–ª–∞–ª: {prob[0]:.2%}.\n\nüìâ –ó”©–≤–ª”©–º–∂: VC —ç—Å–≤—ç–ª angel —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á —Ç–∞—Ç–∞—Ö, –±–∞–≥–∏–π–Ω –±“Ø—Ç—Ü–∏–π–≥ –±—ç—Ö–∂“Ø“Ø–ª—ç—Ö —à–∞–∞—Ä–¥–ª–∞–≥–∞—Ç–∞–π –±–∞–π–∂ –º–∞–≥–∞–¥–≥“Ø–π."

    return msg

# 3. üåê Gradio chatbot-style interface
with gr.Blocks(title="Startup Chatbot (LogReg)") as chatbot:
    gr.Markdown("## ü§ñ –°—Ç–∞—Ä—Ç–∞–ø –ê–º–∂–∏–ª—Ç—ã–Ω –¢–∞–∞–º–∞–≥–ª–∞–≥—á")
    gr.Markdown("–¢–∞ –∞—Å—É—É–ª—Ç–∞–¥ —Ö–∞—Ä–∏—É–ª–∞–∞–¥ ML –∑–∞–≥–≤–∞—Ä—ã–Ω —Ç–∞–∞–º–∞–≥ + –∑”©–≤–ª”©–º–∂”©”© –∞–≤–∞–∞—Ä–∞–π.")

    with gr.Row():
        state = gr.Dropdown(["CA", "NY", "TX", "MA", "otherstate"], label="1. –ê–ª—å –º—É–∂–∏–¥ –±–∞–π—Ä–ª–∞–¥–∞–≥ –≤—ç?")
        category = gr.Dropdown(["biotech", "software", "web", "othercategory"], label="2. –Ø–º–∞—Ä —Å–∞–ª–±–∞—Ä—Ç –∞–∂–∏–ª–ª–∞–¥–∞–≥ –≤—ç?")

    funding = gr.Slider(0, 100, step=1, label="3. –•”©—Ä”©–Ω–≥”© (—Å–∞—è USD)", value=1)
    relationships = gr.Slider(0, 20, step=1, label="4. Co-founder –±–æ–ª–æ–Ω —Ö–∞—Ä–∏–ª—Ü–∞–∞–Ω—ã —Ç–æ–æ", value=3)
    milestones = gr.Slider(0, 10, step=1, label="5. Milestone-–∏–π–Ω —Ç–æ–æ", value=2)
    participants = gr.Slider(0, 10, step=1, label="6. –î—É–Ω–¥–∞–∂ —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á–∏–π–Ω —Ç–æ–æ", value=4)

    vc = gr.Checkbox(label="7. VC —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á—Ç–∞–π —é—É?")
    angel = gr.Checkbox(label="8. Angel —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á—Ç–∞–π —é—É?")
    top500 = gr.Checkbox(label="9. Top 500-–¥ –±–∞–≥—Ç—Å–∞–Ω —É—É?")

    btn = gr.Button("üìä –¢–∞–∞–º–∞–≥ –≥–∞—Ä–≥–∞—Ö")
    result = gr.Textbox(label="üß† –¢–∞–∞–º–∞–≥ –±–∞ –ó”©–≤–ª”©–º–∂")

    btn.click(fn=predict, inputs=[
        state, category, funding, relationships,
        milestones, participants, vc, angel, top500
    ], outputs=result)

chatbot.launch(share=False)


Running on local URL:  http://127.0.0.1:7865

Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB

To create a public link, set `share=True` in `launch()`.




In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib

# 1. üì• CSV —É–Ω—à–∏—Ö
df = pd.read_csv("C:/Users/Dell/Downloads/Ecn325 data (1).csv")

# 2. üéØ Target: 'status' -> acquired: 1, closed: 0
df = df[df['status'].isin(['acquired', 'closed'])].copy()
df['status'] = df['status'].map({'acquired': 1, 'closed': 0})

# 3. üóëÔ∏è –•—ç—Ä—ç–≥–≥“Ø–π –±–∞–≥–∞–Ω—É—É–¥—ã–≥ —Ö–∞—Å–∞—Ö
columns_to_drop = [
    'Unnamed: 0', 'Unnamed: 6', 'id', 'name',
    'object_id', 'state_code.1', 'labels', 'closed_at'
]
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# 4. üè∑Ô∏è –ö–∞—Ç–µ–≥–æ—Ä–∏ –±–∞–≥–∞–Ω—É—É–¥—ã–≥ Label Encode —Ö–∏–π—Ö
label_encoders = {}
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = df[col].fillna("unknown")
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 5. üìä –¢–æ–æ–Ω –±–∞–≥–∞–Ω—É—É–¥—ã–Ω null-—É—É–¥—ã–≥ –¥—É–Ω–¥–∂–∞–∞—Ä –Ω”©—Ö”©—Ö
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# 6. üîÄ Features –±–æ–ª–æ–Ω target-–≥ —Å–∞–ª–≥–∞—Ö
X = df.drop(columns=["status"])
y = df["status"]

# 7. ‚öñÔ∏è –°—Ç–∞–Ω–¥–∞—Ä—Ç–∞–¥ –æ—Ä—É—É–ª–∞—Ö
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 8. ‚úÇÔ∏è Train-Test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# 9. üå≤ –ó–∞–≥–≤–∞—Ä —Å—É—Ä–≥–∞–ª—Ç (–∏–ª“Ø“Ø —Å–∞–π–Ω —Ç–æ—Ö–∏—Ä–≥–æ–æ—Ç–æ–π RandomForest)
model = RandomForestClassifier(
    n_estimators=200, max_depth=10, class_weight='balanced', random_state=42
)
model.fit(X_train, y_train)

# 10. üìà “Æ—Ä –¥“Ø–Ω —à–∞–ª–≥–∞—Ö
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"‚úÖ –ó–∞–≥–≤–∞—Ä—ã–Ω Accuracy: {acc:.2%}")
print(f"üìä F1-score: {f1:.2%}")

# 11. üíæ –ë“Ø—Ö .pkl —Ñ–∞–π–ª—É—É–¥—ã–≥ —Ö–∞–¥–≥–∞–ª–∞—Ö
joblib.dump(model, "startup_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(X.columns.tolist(), "feature_order.pkl")
print("‚úÖ –ë“Ø—Ö .pkl —Ñ–∞–π–ª—É—É–¥ –∞–º–∂–∏–ª—Ç—Ç–∞–π —Ö–∞–¥–≥–∞–ª–∞–≥–¥–ª–∞–∞.")


‚úÖ –ó–∞–≥–≤–∞—Ä—ã–Ω Accuracy: 78.38%
üìä F1-score: 84.13%
‚úÖ –ë“Ø—Ö .pkl —Ñ–∞–π–ª—É—É–¥ –∞–º–∂–∏–ª—Ç—Ç–∞–π —Ö–∞–¥–≥–∞–ª–∞–≥–¥–ª–∞–∞.


In [15]:
import gradio as gr
import pandas as pd
import joblib
import re
import traceback

# 1. –ó–∞–≥–≤–∞—Ä –±–æ–ª–æ–Ω —Ç—É—Å–ª–∞—Ö —Ñ–∞–π–ª—É—É–¥—ã–≥ –∞—á–∞–∞–ª–∞—Ö
model = joblib.load("startup_model.pkl")
scaler = joblib.load("scaler.pkl")
label_encoders = joblib.load("label_encoders.pkl")
feature_order = joblib.load("feature_order.pkl")

# 2. Feature –≥–∞—Ä–≥–∞—Ö ”©—Ä–≥”©—Ç–≥”©—Å”©–Ω —Ñ—É–Ω–∫—Ü
def extract_features(text):
    features = {col: 0 for col in feature_order}

    # --- –ë–∞–π—Ä—à–∏–ª ---
    if re.search(r'\b(CA|California)\b', text, re.I):
        if 'state_code' in label_encoders and 'CA' in label_encoders['state_code'].classes_:
            features['state_code'] = label_encoders['state_code'].transform(['CA'])[0]
            features['is_CA'] = 1
    elif re.search(r'\b(NY|New York)\b', text, re.I):
        if 'state_code' in label_encoders and 'NY' in label_encoders['state_code'].classes_:
            features['state_code'] = label_encoders['state_code'].transform(['NY'])[0]
            features['is_NY'] = 1
    else:
        features['is_otherstate'] = 1

    # --- –°–∞–ª–±–∞—Ä ---
    if 'category_code' in label_encoders:
        if re.search(r'biotech', text, re.I):
            features['category_code'] = label_encoders['category_code'].transform(['biotech'])[0]
            features['is_biotech'] = 1
        elif re.search(r'software', text, re.I):
            features['category_code'] = label_encoders['category_code'].transform(['software'])[0]
            features['is_software'] = 1
        elif re.search(r'web', text, re.I):
            features['category_code'] = label_encoders['category_code'].transform(['web'])[0]
            features['is_web'] = 1
        else:
            features['is_othercategory'] = 1

    # --- –•”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–ª—Ç ---
    match = re.search(r'(\d+(?:\.\d+)?)\s*(—Å–∞—è|million|m)', text, re.I)
    if match:
        features['funding_total_usd'] = float(match.group(1)) * 1_000_000
    else:
        features['funding_total_usd'] = 1_000_000  # default

    # --- –•”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á–∏–¥ ---
    if "angel" in text.lower():
        features['has_angel'] = 1
    if "vc" in text.lower() or "venture" in text.lower():
        features['has_VC'] = 1

    # --- –•–∞—Ä–∏–ª—Ü–∞–∞, milestone, –æ—Ä–æ–ª—Ü–æ–≥—á–∏–¥, top500 ---
    match = re.search(r'(\d+)\s+(—Ö–∞–º—Ç—Ä–∞–≥—á|co[- ]?founder|partner)', text, re.I)
    if match:
        features['relationships'] = int(match.group(1))
    else:
        features['relationships'] = 3

    match = re.search(r'(\d+)\s+(milestone|—à–∞—Ç)', text, re.I)
    if match:
        features['milestones'] = int(match.group(1))
    else:
        features['milestones'] = 2

    match = re.search(r'(\d+)\s+(investor|—Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á)', text, re.I)
    if match:
        features['avg_participants'] = int(match.group(1))
    else:
        features['avg_participants'] = 4

    if re.search(r'top\s*500', text, re.I):
        features['is_top500'] = 1

    # –ó–∞–π–ª—à–≥“Ø–π —Ç–∞–ª–±–∞—Ä—É—É–¥
    features['closed_at'] = 0
    features['status'] = 0

    return features

# 3. –¢–∞–∞–º–∞–≥ –≥–∞—Ä–≥–∞—Ö “Ø–Ω–¥—Å—ç–Ω —Ñ—É–Ω–∫—Ü
def smart_predict(text):
    try:
        input_data = extract_features(text)
        df = pd.DataFrame([input_data])
        df = df[feature_order]  # –∑”©–≤ –¥–∞—Ä–∞–∞–ª–ª—ã–≥ –±–∞—Ä–∏–º—Ç–∞–ª–Ω–∞
        df_scaled = scaler.transform(df)

        prob = model.predict_proba(df_scaled)[0]
        pred = model.predict(df_scaled)[0]

        if pred == 1:
            return f"‚úÖ –ê–º–∂–∏–ª—Ç—Ç–∞–π –±–æ–ª–æ—Ö –º–∞–≥–∞–¥–ª–∞–ª: {prob[1]:.2%}"
        else:
            return f"‚ùå –ê–º–∂–∏–ª—Ç–≥“Ø–π –±–æ–ª–æ—Ö –º–∞–≥–∞–¥–ª–∞–ª: {prob[0]:.2%}"
    except Exception as e:
        traceback.print_exc()
        return "‚ö†Ô∏è –ê–ª–¥–∞–∞ –≥–∞—Ä–ª–∞–∞. –¢–∞ —Ç–µ—Ä–º–∏–Ω–∞–ª–∞–∞—Å —à–∞–ª–≥–∞–Ω–∞ —É—É."

# 4. Gradio –∏–Ω—Ç–µ—Ä—Ñ—ç–π—Å
iface = gr.Interface(
    fn=smart_predict,
    inputs=gr.Textbox(lines=4, label="–°—Ç–∞—Ä—Ç–∞–ø—ã–Ω—Ö–∞–∞ –º—ç–¥—ç—ç–ª–ª–∏–π–≥ –±–∏—á–Ω—ç “Ø“Ø"),
    outputs="text",
    title="–°—Ç–∞—Ä—Ç–∞–ø –ê–º–∂–∏–ª—Ç—ã–Ω –¢–∞–∞–º–∞–≥–ª–∞–≥—á",
    description="–ñ–∏—à—ç—ç: '–ú–∞–Ω–∞–π —Å—Ç–∞—Ä—Ç–∞–ø CA-–¥ –±–∞–π—Ä–ª–∞–¥–∞–≥, biotech —Å–∞–ª–±–∞—Ä—Ç –∞–∂–∏–ª–ª–∞–¥–∞–≥, 5 —Å–∞—è –¥–æ–ª–ª–∞—Ä—ã–Ω —Ö”©—Ä”©–Ω–≥”© –∞–≤—Å–∞–Ω'",
    examples=[
        ["–ú–∞–Ω–∞–π —Å—Ç–∞—Ä—Ç–∞–ø CA-–¥ –±–∞–π—Ä–ª–∞–¥–∞–≥, biotech —Å–∞–ª–±–∞—Ä—Ç –∞–∂–∏–ª–ª–∞–¥–∞–≥, 5 —Å–∞—è –¥–æ–ª–ª–∞—Ä—ã–Ω —Ö”©—Ä”©–Ω–≥”© –∞–≤—Å–∞–Ω, VC –æ—Ä–æ–ª—Ü—Å–æ–Ω"],
        ["Software —Ö–∏–π–¥—ç–≥, angel —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á—Ç–∞–π"],
        ["2 milestone-—Ç–æ–π, 5 —Ö”©—Ä”©–Ω–≥”© –æ—Ä—É—É–ª–∞–≥—á—Ç–∞–π, top 500-–¥ –±–∞–≥—Ç—Å–∞–Ω"],
        ["New York-based startup with 3 co-founders and 2 rounds of funding"]
    ]
)

iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7871
Running on public URL: https://5abeeaa70477407867.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


