In [69]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report


In [70]:

# Load the dataset
data = pd.read_csv('bleh.csv')

# Sorting the data to ensure we can find the first funding instance for each company
data_sorted = data.sort_values(by=['Company', 'Year'], ascending=[True, True])
# data_sorted
# data_sorted.loc[data_sorted['Company'] == 'Groww']

In [71]:
# Dropping the funding details while keeping the first occurrence of each company to preserve the social media data
social_media_data = data_sorted.drop_duplicates(subset=['Company'], keep='first').drop([ 'NewAmountUSD',  'Year'], axis=1)

social_media_data

Unnamed: 0,Company,avg_likes,avg_retweets,avg_hashtags,avg_senti,avg_mentions,avg_tweet_length,avg_links_mentioned,avg_replies,Growth,FDI
64,91springboard,4.710000,4.030000,0.49000,0.551410,0.630000,112.370000,1.310000,0.230000,8.26,1.937364
65,betterplace,3.453333,0.693333,0.36000,0.557348,0.840000,93.853333,0.773333,0.293333,8.26,1.937364
83,bira91,12.110000,2.670000,1.17000,0.607719,0.500000,145.820000,1.130000,2.590000,3.74,1.784826
36,bookeventz,0.310000,0.100000,2.28000,0.528376,0.280000,111.450000,1.400000,0.000000,8.00,2.092115
45,box8,0.139241,0.063291,0.21519,0.624321,0.772152,98.139241,0.582278,0.202532,8.00,2.092115
...,...,...,...,...,...,...,...,...,...,...,...
60,wooplr,2.400000,1.490000,2.04000,0.604832,0.690000,137.060000,1.350000,0.560000,8.26,1.937364
61,yufta,1.380000,0.990000,0.32000,0.536455,0.470000,62.020000,0.470000,0.480000,8.26,1.937364
40,zappfresh,1.090000,0.870000,3.10000,0.611210,0.130000,138.200000,1.330000,0.050000,8.00,2.092115
77,zestmoney,10.230000,5.580000,2.13000,0.598702,0.670000,162.410000,0.780000,6.360000,6.80,1.507316


In [72]:
# Now, let's create a dataframe which contains only the first funding instance for each company
first_funding_data = data_sorted.dropna(subset=['Year']).drop_duplicates(subset=['Company'], keep='first')

first_funding_data

Unnamed: 0,Company,avg_likes,avg_retweets,avg_hashtags,avg_senti,avg_mentions,avg_tweet_length,avg_links_mentioned,avg_replies,NewAmountUSD,Year,Growth,FDI
64,91springboard,4.710000,4.030000,0.49000,0.551410,0.630000,112.370000,1.310000,0.230000,0,2016,8.26,1.937364
65,betterplace,3.453333,0.693333,0.36000,0.557348,0.840000,93.853333,0.773333,0.293333,0,2016,8.26,1.937364
83,bira91,12.110000,2.670000,1.17000,0.607719,0.500000,145.820000,1.130000,2.590000,10000000,2019,3.74,1.784826
36,bookeventz,0.310000,0.100000,2.28000,0.528376,0.280000,111.450000,1.400000,0.000000,0,2015,8.00,2.092115
45,box8,0.139241,0.063291,0.21519,0.624321,0.772152,98.139241,0.582278,0.202532,3500000,2015,8.00,2.092115
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,wooplr,2.400000,1.490000,2.04000,0.604832,0.690000,137.060000,1.350000,0.560000,0,2016,8.26,1.937364
61,yufta,1.380000,0.990000,0.32000,0.536455,0.470000,62.020000,0.470000,0.480000,0,2016,8.26,1.937364
40,zappfresh,1.090000,0.870000,3.10000,0.611210,0.130000,138.200000,1.330000,0.050000,77000,2015,8.00,2.092115
77,zestmoney,10.230000,5.580000,2.13000,0.598702,0.670000,162.410000,0.780000,6.360000,6500000,2017,6.80,1.507316


In [73]:
# Merge the social media data with the first funding data
merged_data = pd.merge(social_media_data, first_funding_data, on='Company', how='left')
merged_data

Unnamed: 0,Company,avg_likes_x,avg_retweets_x,avg_hashtags_x,avg_senti_x,avg_mentions_x,avg_tweet_length_x,avg_links_mentioned_x,avg_replies_x,Growth_x,...,avg_hashtags_y,avg_senti_y,avg_mentions_y,avg_tweet_length_y,avg_links_mentioned_y,avg_replies_y,NewAmountUSD,Year,Growth_y,FDI_y
0,91springboard,4.710000,4.030000,0.49000,0.551410,0.630000,112.370000,1.310000,0.230000,8.26,...,0.49000,0.551410,0.630000,112.370000,1.310000,0.230000,0,2016,8.26,1.937364
1,betterplace,3.453333,0.693333,0.36000,0.557348,0.840000,93.853333,0.773333,0.293333,8.26,...,0.36000,0.557348,0.840000,93.853333,0.773333,0.293333,0,2016,8.26,1.937364
2,bira91,12.110000,2.670000,1.17000,0.607719,0.500000,145.820000,1.130000,2.590000,3.74,...,1.17000,0.607719,0.500000,145.820000,1.130000,2.590000,10000000,2019,3.74,1.784826
3,bookeventz,0.310000,0.100000,2.28000,0.528376,0.280000,111.450000,1.400000,0.000000,8.00,...,2.28000,0.528376,0.280000,111.450000,1.400000,0.000000,0,2015,8.00,2.092115
4,box8,0.139241,0.063291,0.21519,0.624321,0.772152,98.139241,0.582278,0.202532,8.00,...,0.21519,0.624321,0.772152,98.139241,0.582278,0.202532,3500000,2015,8.00,2.092115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,wooplr,2.400000,1.490000,2.04000,0.604832,0.690000,137.060000,1.350000,0.560000,8.26,...,2.04000,0.604832,0.690000,137.060000,1.350000,0.560000,0,2016,8.26,1.937364
80,yufta,1.380000,0.990000,0.32000,0.536455,0.470000,62.020000,0.470000,0.480000,8.26,...,0.32000,0.536455,0.470000,62.020000,0.470000,0.480000,0,2016,8.26,1.937364
81,zappfresh,1.090000,0.870000,3.10000,0.611210,0.130000,138.200000,1.330000,0.050000,8.00,...,3.10000,0.611210,0.130000,138.200000,1.330000,0.050000,77000,2015,8.00,2.092115
82,zestmoney,10.230000,5.580000,2.13000,0.598702,0.670000,162.410000,0.780000,6.360000,6.80,...,2.13000,0.598702,0.670000,162.410000,0.780000,6.360000,6500000,2017,6.80,1.507316


In [74]:
# Create the binary target variable 'Funded' which is 'Yes' if 'NewAmountUSD' is greater than 0, 'No' otherwise
merged_data['Funded'] = np.where(merged_data['NewAmountUSD'] > 0, 'Yes', 'No')

merged_data

Unnamed: 0,Company,avg_likes_x,avg_retweets_x,avg_hashtags_x,avg_senti_x,avg_mentions_x,avg_tweet_length_x,avg_links_mentioned_x,avg_replies_x,Growth_x,...,avg_senti_y,avg_mentions_y,avg_tweet_length_y,avg_links_mentioned_y,avg_replies_y,NewAmountUSD,Year,Growth_y,FDI_y,Funded
0,91springboard,4.710000,4.030000,0.49000,0.551410,0.630000,112.370000,1.310000,0.230000,8.26,...,0.551410,0.630000,112.370000,1.310000,0.230000,0,2016,8.26,1.937364,No
1,betterplace,3.453333,0.693333,0.36000,0.557348,0.840000,93.853333,0.773333,0.293333,8.26,...,0.557348,0.840000,93.853333,0.773333,0.293333,0,2016,8.26,1.937364,No
2,bira91,12.110000,2.670000,1.17000,0.607719,0.500000,145.820000,1.130000,2.590000,3.74,...,0.607719,0.500000,145.820000,1.130000,2.590000,10000000,2019,3.74,1.784826,Yes
3,bookeventz,0.310000,0.100000,2.28000,0.528376,0.280000,111.450000,1.400000,0.000000,8.00,...,0.528376,0.280000,111.450000,1.400000,0.000000,0,2015,8.00,2.092115,No
4,box8,0.139241,0.063291,0.21519,0.624321,0.772152,98.139241,0.582278,0.202532,8.00,...,0.624321,0.772152,98.139241,0.582278,0.202532,3500000,2015,8.00,2.092115,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,wooplr,2.400000,1.490000,2.04000,0.604832,0.690000,137.060000,1.350000,0.560000,8.26,...,0.604832,0.690000,137.060000,1.350000,0.560000,0,2016,8.26,1.937364,No
80,yufta,1.380000,0.990000,0.32000,0.536455,0.470000,62.020000,0.470000,0.480000,8.26,...,0.536455,0.470000,62.020000,0.470000,0.480000,0,2016,8.26,1.937364,No
81,zappfresh,1.090000,0.870000,3.10000,0.611210,0.130000,138.200000,1.330000,0.050000,8.00,...,0.611210,0.130000,138.200000,1.330000,0.050000,77000,2015,8.00,2.092115,Yes
82,zestmoney,10.230000,5.580000,2.13000,0.598702,0.670000,162.410000,0.780000,6.360000,6.80,...,0.598702,0.670000,162.410000,0.780000,6.360000,6500000,2017,6.80,1.507316,Yes


In [76]:
# Selecting features for LDA
features = ['avg_likes_x',	'avg_retweets_x',	'avg_hashtags_x',	'avg_senti_x',	'avg_mentions_x',	'avg_tweet_length_x',	'avg_links_mentioned_x',
	'avg_likes_y',	'avg_retweets_y',	'avg_hashtags_y',	'avg_senti_y',	'avg_mentions_y',	'avg_tweet_length_y',	'avg_links_mentioned_y']
X = merged_data[features]
y = merged_data['Funded']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_scaled, y_train)

# Making predictions
y_pred = lda.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
coefficients = lda.coef_[0]

# Output the cleaned and merged dataset
# merged_data.to_csv('/mnt/data/cleaned_and_merged_data.csv', index=False)

# Display the accuracy and classification report
print("Accuracy of the LDA model:", accuracy)
print("Classification Report:\n", report)

Accuracy of the LDA model: 0.6153846153846154
Classification Report:
               precision    recall  f1-score   support

          No       0.43      0.33      0.38         9
         Yes       0.68      0.76      0.72        17

    accuracy                           0.62        26
   macro avg       0.56      0.55      0.55        26
weighted avg       0.60      0.62      0.60        26



In [77]:
print("Feature Coefficients:")
for feature, coef in zip(features, coefficients):
    print(f"{feature}: {coef}")

Feature Coefficients:
avg_likes_x: 0.10688389498563926
avg_retweets_x: -0.049114192902257284
avg_hashtags_x: -0.07664170832051914
avg_senti_x: 0.04878651806772645
avg_mentions_x: -0.12824358565483682
avg_tweet_length_x: 0.14679698261917143
avg_links_mentioned_x: -0.06814482136352973
avg_likes_y: 0.10688389498563941
avg_retweets_y: -0.04911419290225737
avg_hashtags_y: -0.07664170832051925
avg_senti_y: 0.04878651806772642
avg_mentions_y: -0.12824358565483682
avg_tweet_length_y: 0.14679698261917154
avg_links_mentioned_y: -0.06814482136352973


In [78]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# Check the balance of the classes
print(merged_data['Funded'].value_counts())




Yes    51
No     33
Name: Funded, dtype: int64


In [79]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score

# Assuming the data is already loaded and split into X_train, X_test, y_train, y_test

# Feature Engineering: create polynomial features, interactions, etc.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Feature Selection with Recursive Feature Elimination and Cross-Validation
selector = RFECV(estimator=RandomForestClassifier(), step=1, cv=5)
selector = selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Use a more complex model like Random Forest
rf = RandomForestClassifier()

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV
# # param_grid = { ... } # Define a parameter grid 
# grid_search = GridSearchCV(rf, param_grid, cv=5)
# grid_search.fit(X_train, y_train)
# best_model = grid_search.best_estimator_

# Fit the model
rf.fit(X_train, y_train)

# Cross-validation
cv_scores = cross_val_score(rf, X_train, y_train, cv=5)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the results
print("Cross-validated scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())
print("Accuracy of the Random Forest model:", accuracy)
print("Classification Report:\n", report)


Cross-validated scores: [0.33333333 0.41666667 0.16666667 0.63636364 0.63636364]
Mean CV score: 0.43787878787878787
Accuracy of the Random Forest model: 0.6153846153846154
Classification Report:
               precision    recall  f1-score   support

          No       0.33      0.11      0.17         9
         Yes       0.65      0.88      0.75        17

    accuracy                           0.62        26
   macro avg       0.49      0.50      0.46        26
weighted avg       0.54      0.62      0.55        26

