# 4. Random Forest

## Questions 3 & 4

### 1. Read the CSV files produced in the previous question.

In [2]:
import pandas as pd
# Load the augmented training and validation splits, in that order.
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ('d1_train_augmented.csv', 'd1_val_augmented.csv')
)

In [3]:
# Preview the training frame; the rendered table elides the middle rows.
df_train

Unnamed: 0,text,disaster_label,predicted_sentiment_label
0,New #photo Oak in a snowstorm http://t.co/HK9Y...,1.0,0
1,'You can only be rescued from where you actual...,0.0,0
2,@runner_joy yes; especially new clients that w...,0.0,1
3,Dear @CanonUSAimaging I brought it ;) #CanonBr...,0.0,0
4,**Let - Me - Be - Your - Hot - Blazing - Fanta...,0.0,2
...,...,...,...
6085,nothing surprises me anymore and i am sure the...,0.0,2
6086,Is Stuart Broad the Prime Minister yet. Best t...,0.0,0
6087,Preacher faces UK terrorism charges http://t.c...,1.0,1
6088,Suggs &amp; Vivian to the rescue! #psychrewatch,0.0,2


In [4]:
# Preview the validation frame; the rendered table elides the middle rows.
df_val

Unnamed: 0,text,disaster_label,predicted_sentiment_label
0,Having trouble understanding the rotations wit...,0.0,0
1,Wow Crackdown 3 uses multiple servers in multi...,0.0,0
2,Heart disease prevention: What about secondhan...,0.0,0
3,Cape Coral city leaders take part in mock hurr...,1.0,0
4,#np Avenged Sevenfold - Hail To The King,0.0,1
...,...,...,...
1518,Fukushima Nuclear Disaster | Increased Thyroid...,1.0,1
1519,#LukeBox something about first responders/ mil...,0.0,2
1520,GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...,1.0,0
1521,Diageo's CEO stresses that a board revolt at U...,0.0,1


### 2. Import the libraries for vectorizing the text data, computing the F1 score, and training the random forest classifier.

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

In [6]:
# Targets: the disaster label column of each split.
y_train = df_train['disaster_label']
y_validation = df_val['disaster_label']

# Features: TF-IDF vectors. The vocabulary is fitted on the training text only
# and then reused, unchanged, to transform the validation text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train['text'])
X_validation = vectorizer.transform(df_val['text'])

In this case, we use scikit-optimize to tune the three sample-weight parameters (lambdas).

In [7]:
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [8]:
# Import the optimization helpers and define the search space: each lambda ranges over [0, 1].
from skopt import gp_minimize
from skopt.space import Real
from skopt.utils import use_named_args
# One real-valued dimension per lambda, each bounded to [0, 1].
search_space = [
    Real(0, 1, name=lambda_name)
    for lambda_name in ("lambda_0", "lambda_n", "lambda_p")
]

In [9]:
@use_named_args(search_space)
def evaluate_model(**params):
    """Objective for the optimizer: negative validation F1 of a weighted random forest.

    Each training row gets a sample weight chosen by its
    ``predicted_sentiment_label`` (0 -> lambda_0, 1 -> lambda_n, 2 -> lambda_p).
    A random forest is fit on ``X_train``/``y_train`` with those weights and
    scored on the validation split.

    Args:
        **params: Keyword arguments ``lambda_0``, ``lambda_n`` and ``lambda_p``,
            supplied by ``use_named_args`` from the point being evaluated.

    Returns:
        The negated F1 score on the validation set, so that minimizing this
        value maximizes F1.

    Raises:
        ValueError: If a training row has a sentiment label other than 0, 1
            or 2, since it would receive no weight.
    """
    weight_mapping = {
        0: params['lambda_0'],
        1: params['lambda_n'],
        2: params['lambda_p'],
    }

    # Keep the weights in a local Series: writing them into df_train on every
    # optimizer call would leave the shared frame holding the weights of the
    # last trial evaluated, not the best one.
    sample_weights = df_train['predicted_sentiment_label'].map(weight_mapping)
    if sample_weights.isna().any():
        raise ValueError(
            "predicted_sentiment_label contains values outside {0, 1, 2}; "
            "cannot assign a sample weight to every training row"
        )

    # Train a random forest classifier and predict on the validation set.
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train, sample_weight=sample_weights)
    y_pred = rf.predict(X_validation)

    # gp_minimize minimizes its objective, so return -F1 to maximize F1.
    return -f1_score(y_validation, y_pred)


In [10]:
# Search the lambda space for the weights that minimize evaluate_model
# (i.e. maximize the validation F1 score).
# NOTE(review): skopt documents n_initial_points=10 as the default, so with
# n_calls=10 every evaluation may be a random initial point rather than a
# GP-guided one — confirm, and consider raising n_calls.
result = gp_minimize(
    func=evaluate_model,
    dimensions=search_space,
    n_calls=10,
    random_state=42,
)

In [11]:
# Pair each lambda name with the best value found, then report the lambdas
# together with the validation F1 score they achieved (result.fun holds -F1).
best_params = dict(zip(("lambda_0", "lambda_n", "lambda_p"), result.x))
print("Best lambdas: ", best_params)
print("Best F1 score: ", -result.fun)

Best lambdas:  {'lambda_0': 0.7965429868602331, 'lambda_n': 0.18343478986616382, 'lambda_p': 0.7796910002727695}
Best F1 score:  0.7210762331838565


### 3. Predict on the test dataset using the parameters obtained above.

In [12]:
# Load the test set from 'test.csv' and preview it; the preview shows
# id, keyword, location and text columns and no label column.
d1_test = pd.read_csv('test.csv')
d1_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [13]:
# Vectorize the test text with the already-fitted TF-IDF vectorizer (transform only, no refit).
X_test = vectorizer.transform(d1_test['text'])

In [14]:
# Look up the tuned lambdas, turn them into per-row sample weights keyed by the
# predicted sentiment label, and fit the final random forest on the training set.
lambda_0 = best_params['lambda_0']
lambda_n = best_params['lambda_n']
lambda_p = best_params['lambda_p']
weight_mapping = dict(zip((0, 1, 2), (lambda_0, lambda_n, lambda_p)))
df_train['weights'] = df_train['predicted_sentiment_label'].map(weight_mapping)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train, sample_weight=df_train['weights'])

In [15]:
# Predict a label for every test row and store it in a new 'target' column.
d1_test['target'] = rf.predict(X_test)

In [16]:
# Cast the predictions to int (presumably they come back as floats like the
# 1.0/0.0 training labels — confirm), then preview the first rows.
d1_test['target'] = d1_test['target'].astype(int)
d1_test.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [17]:
# Finally, write the id and target columns to a CSV file for submission on Kaggle.
# d1_test[['id', 'target']].to_csv('team_whs_rf_submission1.csv', index=False)