 Startups


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Compare several regressors on the startups dataset by their R^2 score on a
# held-out test split, and print the scores ranked from best to worst.

# One seed shared by the train/test split and every estimator that uses
# randomness, so that re-running the cell reproduces the same R^2 table.
RANDOM_STATE = 42

# Load the dataset
data_path = '/content/50_Startups (1).csv'  # Make sure to update this path
startups_data = pd.read_csv(data_path)

# One-hot encode the 'State' column; drop_first=True omits the first dummy
# level so the remaining dummy columns are not redundant with each other.
startups_data_encoded = pd.get_dummies(
    startups_data, columns=['State'], drop_first=True
)

# Splitting the dataset into features (X) and target (y)
X = startups_data_encoded.drop('Profit', axis=1)
y = startups_data_encoded['Profit']

# Splitting the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Initialize models. The tree-based estimators are seeded explicitly: without
# random_state their fitted trees (and therefore their R^2) can change from
# one run to the next, which makes the ranking below unreproducible.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest Regressor': RandomForestRegressor(random_state=RANDOM_STATE),
}

# Dictionary to store R^2 values, keyed by model display name
r2_values = {}

# Train each model, make predictions, and calculate R^2 on the test split
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_values[name] = r2_score(y_test, y_pred)

# Convert the R^2 values to a DataFrame for better presentation, ordered from
# the highest score to the lowest with a fresh 0..n-1 index.
r2_values_df = (
    pd.DataFrame(list(r2_values.items()), columns=['Model', 'R^2 Value'])
    .sort_values(by='R^2 Value', ascending=False)
    .reset_index(drop=True)
)

print(r2_values_df)


                     Model  R^2 Value
0  Random Forest Regressor   0.899095
1         Ridge Regression   0.898878
2         Lasso Regression   0.898734
3        Linear Regression   0.898727
4  Decision Tree Regressor   0.817853


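Given these results, Random Forest, Ridge, Lasso, and plain Linear Regression all reach an R^2 of about 0.899 and differ by less than 0.0004 on this single 20% hold-out split, so the ranking among them is not meaningful; only the Decision Tree Regressor (0.818) is clearly behind. If you were to choose a model for predicting profit in this context, Ridge or Lasso Regression would be strong candidates due to their balance of simplicity and predictive power.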
