### Classification in Python with Random Forest Algorithm


**Task**: Given tweets with their like, retweet, and reply counts --> predict which ones will be viral (0/1 - binary classification)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [25]:
# Load the Part 1 tweet dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/tweets.csv")
df.head(n=5)

Unnamed: 0,likes,retweets,replies,is_viral
0,10,2,0,0
1,123,14,5,1
2,203,34,29,1
3,23,2,3,0
4,1,0,0,0


In [26]:
# (number of rows, number of columns) of the loaded frame
df.shape

(100, 4)

In [27]:
# Count the rows per target class to see how balanced the labels are.
df['is_viral'].value_counts()

is_viral
0    74
1    26
Name: count, dtype: int64


#### Split the data **horizontally** and **vertically** ==> Features/Target and Train/Test data

In [29]:
# Features: all rows, every column except the last one.
x = df[df.columns[:-1]].to_numpy()
# Target: all rows, only the last column.
y = df[df.columns[-1]].to_numpy()

In [30]:
# y[0:5]

In [31]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1234,
)

In [32]:
# x_train.shape

#### Choose a model, train it, and predict values

In [33]:
# Train a random forest on the training split.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature sampling), so a Restart & Run All reproduces the same model
# and therefore the same predictions and accuracy shown below.
clf = RandomForestClassifier(random_state=1234)
clf.fit(x_train, y_train)

In [34]:
# test prediction manually
viral_predict = clf.predict([[5, 5, 5], [199, 20, 1]])
viral_predict

array([0, 1])

In [35]:
# Predict a label for every row of the held-out test features.
y_pred = clf.predict(x_test)
y_pred

array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [36]:
# True labels of the test rows, shown for comparison with y_pred.
y_test

array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [37]:
# design table for better visualization
df_compare = pd.DataFrame(
    data={
        'likes': x_test[:,0],
        'retweets': x_test[:,1],
        'replies': x_test[:,2],
        'predicted_value': y_pred,
        'real_value': y_test
    },
    columns=['likes', 'retweets', 'replies', 'predicted_value', 'real_value'])
df_compare


Unnamed: 0,likes,retweets,replies,predicted_value,real_value
0,3,0,0,0,0
1,69,15,20,1,1
2,299,40,3,1,1
3,8,0,0,0,0
4,9,2,9,0,0
5,156,10,8,1,1
6,184,9,20,1,1
7,4,0,0,0,0
8,5,0,5,0,0
9,301,23,9,1,1


#### Model accuracy assessment

In [40]:
# Fraction of test labels predicted correctly; arguments are (true labels, predicted labels).
# NOTE: the test split holds only 20 rows, so this score is a coarse estimate.
accuracy_score(y_test, y_pred)

1.0

______________________

### Classification in Python with Random Forest Algorithm - Part2

**Task**: Combine two .csv columns into one ==> categorize df

In [6]:
# pandas, numpy and the scikit-learn helpers are already imported in the first cell.

# Part 2 dataset: the same tweet metrics plus an extra `to_delete` flag.
df = pd.read_csv(filepath_or_buffer="data/tweets_viral_delete.csv")
df.head(n=5)

Unnamed: 0,likes,retweets,replies,is_viral,to_delete
0,10,2,0,0,0
1,123,14,5,1,0
2,203,34,29,1,0
3,23,2,3,0,0
4,1,0,0,0,1


In [9]:
def categorize(row):
    """Return the category label for one tweet row.

    The row must expose 'is_viral' and 'to_delete' by key. A viral flag wins
    over a delete flag; a row with neither flag set is labelled 'Normal'.
    """
    if row['is_viral'] == 1:
        return 'Viral'
    return 'To Delete' if row['to_delete'] == 1 else 'Normal'

# axis=1 hands each ROW to categorize() (the function is applied across the
# columns of one row at a time). Once the two flag columns have been folded
# into `category` they are dropped in a single call.
df = (
    df
    .assign(category=df.apply(categorize, axis=1))
    .drop(columns=['is_viral', 'to_delete'])
)
df.head()

Unnamed: 0,likes,retweets,replies,category
0,10,2,0,Normal
1,123,14,5,Viral
2,203,34,29,Viral
3,23,2,3,Normal
4,1,0,0,To Delete


In [11]:
# Count the rows per category to see how balanced the three classes are.
df['category'].value_counts()

category
Normal       58
Viral        26
To Delete    16
Name: count, dtype: int64

--------------
**Task**: modelling, train, predict, evaluate

In [24]:
# Features are every column except the last; the target is the last column (category).
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the classes are imbalanced (58 / 26 / 16) - consider stratify=y so
# that the small 'To Delete' class is represented proportionally in the test split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# random_state pins the forest's internal randomness so that re-running the
# notebook reproduces the same model, predictions and accuracy.
clf = RandomForestClassifier(random_state=0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
y_pred

array(['Normal', 'Normal', 'Viral', 'Normal', 'Normal', 'Normal',
       'Normal', 'To Delete', 'Normal', 'Normal', 'Normal', 'To Delete',
       'Normal', 'Normal', 'Normal', 'Viral', 'Normal', 'Viral', 'Normal',
       'Viral'], dtype=object)

In [25]:
# Fraction of test labels predicted correctly; arguments are (true labels, predicted labels).
accuracy_score(y_test, y_pred)

0.95

In [26]:
# Put the test features next to the predicted and the actual categories so that
# misclassified rows are easy to spot.
df_compare = pd.DataFrame(
    x_test[:, :3], columns=['likes', 'retweets', 'replies']
).assign(predicted_value=y_pred, real_value=y_test)
df_compare

Unnamed: 0,likes,retweets,replies,predicted_value,real_value
0,29,2,2,Normal,Normal
1,65,4,2,Normal,Normal
2,203,34,29,Viral,Viral
3,28,0,10,Normal,Normal
4,33,3,3,Normal,Normal
5,50,5,5,Normal,Normal
6,0,0,11,Normal,To Delete
7,0,0,5,To Delete,To Delete
8,39,3,9,Normal,Normal
9,48,3,20,Normal,Normal
