In [20]:
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [105]:
# all variables
fb_train = pd.read_csv("./facebook_train.csv")
fb_train = pd.concat([fb_train, pd.get_dummies(fb_train["type"]), pd.get_dummies(fb_train["category"]), pd.get_dummies(fb_train["weekday"]), pd.get_dummies(fb_train["month"]), pd.get_dummies(fb_train["hour"])], axis=1)
fb_train = fb_train.drop(["type", "category", "month", "weekday", "hour"], axis=1)
fb_train.columns = ["followers", "paid", "comments", "likes", "shares", "Link", "Photo", "Status", "Video", "Category 1", "Category 2", "Category 3", "Mon","Tue","Wed","Thu","Fri","Sat","Sun", "Jan","Feb","Mar","May","Apr","Jun", "Jul", "Aug","Sep","Oct","Nov", "Dez", "1am", "2am", "3am", "4am", "5am", "6am", "7am", "8am", "9am", "12pm", "3pm", "4pm", "5pm", "6pm", "7pm", "8pm", "9pm","10pm", "11pm","12am"]

In [36]:
#no hours and above
fb_train = pd.read_csv("./facebook_train.csv")
fb_train = pd.concat([fb_train, pd.get_dummies(fb_train["type"]), pd.get_dummies(fb_train["category"]), pd.get_dummies(fb_train["weekday"]), pd.get_dummies(fb_train["month"])], axis=1)
fb_train = fb_train.drop(["type", "category", "month", "weekday", "hour"], axis=1)
fb_train.columns = ["followers", "paid", "comments", "likes", "shares", "Link", "Photo", "Status", "Video", "Category 1", "Category 2", "Category 3", "Mon","Tue","Wed","Thu","Fri","Sat","Sun", "Jan","Feb","Mar","May","Apr","Jun", "Jul", "Aug","Sep","Oct","Nov", "Dez"]

In [41]:
#no month and above
fb_train = pd.read_csv("./facebook_train.csv")
fb_train = pd.concat([fb_train, pd.get_dummies(fb_train["type"]), pd.get_dummies(fb_train["category"]), pd.get_dummies(fb_train["weekday"])], axis=1)
fb_train = fb_train.drop(["type", "category", "month", "weekday", "hour"], axis=1)
fb_train.columns = ["followers", "paid", "comments", "likes", "shares", "Link", "Photo", "Status", "Video", "Category 1", "Category 2", "Category 3", "Mon","Tue","Wed","Thu","Fri","Sat","Sun"]

In [96]:
#no day of the week and above
fb_train = pd.read_csv("./facebook_train.csv")
fb_train = pd.concat([fb_train, pd.get_dummies(fb_train["type"]), pd.get_dummies(fb_train["category"])], axis=1)
fb_train = fb_train.drop(["type", "category", "month", "weekday", "hour"], axis=1)
fb_train.columns = ["followers", "paid", "comments", "likes", "shares", "Link", "Photo", "Status", "Video", "Category 1", "Category 2", "Category 3"]

In [100]:
#only paid
fb_train = pd.read_csv("./facebook_train.csv")
fb_train = pd.concat([fb_train, pd.get_dummies(fb_train["type"])], axis=1)
fb_train = fb_train.drop(["total_followers", "type", "category", "month", "weekday", "hour","paid"], axis=1)
fb_train.columns = ["comments", "likes", "shares","Link", "Photo", "Status", "Video"]

In [146]:
#splitting model
fb_model = fb_train.drop(["comments", "shares"], axis=1)
fb_target = fb_model.likes
x_train, x_test, y_train, y_test = train_test_split(fb_model, fb_target, test_size=0.2, random_state=0)
x_train = x_train.drop(["likes"], axis=1)
y_train

64       89
55       18
316     430
102    2306
261     122
       ... 
323     618
192      67
117    1479
47      421
172     423
Name: likes, Length: 317, dtype: int64

In [147]:
#running regression and cross validation
reg = linear_model.LinearRegression()
scores = cross_val_score(reg, x_train, y_train, cv=10)
print(scores)
reg.fit(x_train, y_train)

[-0.01836078 -0.79644755 -0.96533959 -0.36696886 -0.69573124 -2.8097023
 -0.50664432 -0.04484552 -0.05080879 -1.22232566]


LinearRegression()

In [148]:
#displaying coefficients for linear model
results_df = pd.DataFrame({"var": fb_model.drop(["likes"], axis=1).columns, "coef": reg.coef_})

results_df.round(2).sort_values('coef',ascending=False)

Unnamed: 0,var,coef
42,7pm,647.67
16,Jan,581.07
17,Feb,415.74
18,Mar,216.7
31,4am,209.01
34,7am,175.54
19,May,158.09
1,paid,102.86
15,Sun,98.36
14,Sat,63.47


In [149]:
#showing results
print(mean_squared_error(y_test, reg.predict(x_test.drop(["likes"], axis=1))))
print(mean_absolute_error(y_test, reg.predict(x_test.drop(["likes"], axis=1))))
print(r2_score(y_test, reg.predict(x_test.drop(["likes"], axis=1))))

131740.65351615925
212.83526390027322
-0.24350268549993337


# Interpretation

The linear model is performing very poorly as suggested by the cross-validation test and the negative scores – a negative score indicates that simply taking the mean would perform better than the model itself. This is very likely since there shouldn't be a linear relationship between the number of likes for a given post and all the other variables. Moreover, some of the variables are not complete (such as time of the day).

To check if this occurs due to model complexity I ran the regression with multiple number of reduced parameters. The more parameters I withhold, the better the prediction. However, this assumes a linear relationship which I am then just optimizing for. Instead it would have been better to find a non-polynomial or n-degree polynomial model that better represents the underlying relationship and bias.

When running the regression with all parameters, the following seems to influence the number of likes the most:

- Either early morning or evening (but not too late)
- Beginning of the year
- End of the week
- Paid
- Category 2
- Video or Photo

The opposite of that seems to reduce likes the most (aka late in the evening, mid to end of the year).

In [115]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier 

In [150]:
#loading and inspecting data
casual_data = pd.read_csv("./casualty_train.csv")
casual_data = casual_data[(casual_data['age'] != 'Unknown')]
casual_data = casual_data[(casual_data['pedestrian_location'] != 'Unknown')]
print(casual_data.head())
print(casual_data.pedestrian_location.unique())
print(casual_data.pedestrian_movement.unique())

  casualty_class  gender age  severe pedestrian_location pedestrian_movement  \
0      passenger  female  33   False                 NaN                 NaN   
1      passenger  female  20   False                 NaN                 NaN   
2      passenger    male  52   False                 NaN                 NaN   
3      passenger  female  17   False                 NaN                 NaN   
4      passenger  female  20   False                 NaN                 NaN   

       travel  year  
0   motorbike  2007  
1         car  2005  
2         car  2006  
3  pedestrian  2012  
4   motorbike  2010  
[nan 'In road' 'Close to Crossing' 'Not on Crossing' 'Pedestrian Crossing'
 'Footpath' 'Zig-Zag']
[nan 'In Rd Not Crossing' 'Drivers N/Side' 'Drivers O/Side'
 'Unknown Or Other' 'Back To Traffic' 'Drivers N/Side Msk'
 'Drivers O/Side Msk' 'Facing Traffic' 'In Rd Not Crossing Msk']


In [152]:
#all variables
casual_data_dummy = pd.concat([casual_data, pd.get_dummies(casual_data["casualty_class"]), pd.get_dummies(casual_data["gender"]), pd.get_dummies(casual_data["travel"]), pd.get_dummies(casual_data["pedestrian_location"]), pd.get_dummies(casual_data["pedestrian_movement"]), pd.get_dummies(casual_data["year"])], axis=1).drop(["casualty_class", "gender", "pedestrian_location", "pedestrian_movement", "travel","year"],axis=1)

In [153]:
#no pedestrian movement
casual_data_dummy = pd.concat([casual_data, pd.get_dummies(casual_data["casualty_class"]), pd.get_dummies(casual_data["gender"]), pd.get_dummies(casual_data["travel"]), pd.get_dummies(casual_data["pedestrian_location"]), pd.get_dummies(casual_data["year"])], axis=1).drop(["casualty_class", "gender", "pedestrian_location", "pedestrian_movement", "travel","year"],axis=1)

In [154]:
#no pedestrian location and above
casual_data_dummy = pd.concat([casual_data, pd.get_dummies(casual_data["casualty_class"]), pd.get_dummies(casual_data["gender"]), pd.get_dummies(casual_data["travel"]), pd.get_dummies(casual_data["year"])], axis=1).drop(["casualty_class", "gender", "pedestrian_location", "pedestrian_movement", "travel","year"],axis=1)

In [155]:
#no year and above
casual_data_dummy = pd.concat([casual_data, pd.get_dummies(casual_data["casualty_class"]), pd.get_dummies(casual_data["gender"]), pd.get_dummies(casual_data["travel"])], axis=1).drop(["casualty_class", "gender", "pedestrian_location", "pedestrian_movement", "travel","year"],axis=1)

In [156]:
#data splitting
x_train, x_test, y_train, y_test = train_test_split(casual_data_dummy.drop(["severe"], axis=1), casual_data_dummy.severe, test_size=0.2, random_state=0)
x_train.columns

Index(['age', 'driver', 'passenger', 'pedestrian', 'female', 'male', 'bicycle',
       'bus', 'car', 'motorbike', 'other', 'pedestrian', 'taxi'],
      dtype='object')

In [157]:
#running classification
rf = RandomForestClassifier()
rf.fit(x_train, y_train) 
print(rf.score(x_test, y_test))

0.8925652640919068


In [158]:
#showing variable importance
feature_importances = pd.DataFrame(rf.feature_importances_, index = x_train.columns, columns=['importance'])
feature_importances

Unnamed: 0,importance
age,0.444847
driver,0.025597
passenger,0.025116
pedestrian,0.12988
female,0.011688
male,0.012966
bicycle,0.101521
bus,0.007733
car,0.155882
motorbike,0.04879


# Interpretation

The Random Forest Classifier is performing relatively well (89% accuracy). Since a Random Forest basically acts like cross-validation, I did not perform cross-validation for this classification. To improve the classification, I removed several variable classes such as the pedestrian movement, the pedestrian location, and the year.

As seen in the table above, the age of the person as well as whether he was the pedestrian or traveling in a car or bicycle mattered most. This is very intuitive as the older the person is, the more severe the casualty should be. Additionally, if one is the pedestrian and is hit by a car or bicycle, it will be much more severe (and also likely).