In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [11]:
train = pd.read_csv("preprocessed_train_data.csv", index_col = 0)
test = pd.read_csv("preprocessed_test_data.csv", index_col = 0)

trained_tweets = train['keyword']+train['text']
test_tweets = test['keyword']+test['text']

In [12]:
state = 12  
test_size = 0.30
  
X_train, X_test, y_train, y_test = train_test_split(trained_tweets, train['target'],  
    test_size=test_size, random_state=state)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

word_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    lowercase=True,
    min_df=5,
    max_features=30000)

char_vectorizer = TfidfVectorizer(
    analyzer='char',
    stop_words='english',
    ngram_range=(3, 6),
    lowercase=True,
    min_df=5,
    max_features=50000)

vectorizer = FeatureUnion([('word_vectorizer', word_vectorizer),  ('char_vectorizer', char_vectorizer)])
vectorizer.fit(X_train)

X_train_vectors = vectorizer.transform(X_train).toarray()
X_test_vectors = vectorizer.transform(X_test).toarray()
test_vec = vectorizer.transform(test_tweets).toarray()
print(X_train_vectors.shape, X_test_vectors.shape)

(5329, 44132) (2284, 44132)


In [14]:
model = RandomForestClassifier(n_estimators=100, 
                               random_state=50, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)
model.fit(X_train_vectors, y_train)
predictions = model.predict(X_test_vectors)

print("Accuracy score (training): {0:.3f}".format(model.score(X_train_vectors, y_train)))
print("Accuracy score (validation): {0:.3f}".format(model.score(X_test_vectors, y_test)))
print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


Accuracy score (training): 0.989


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


Accuracy score (validation): 0.787
F1 Score: 0.728
Recall: 0.688
Precision: 0.773


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


In [10]:
model = RandomForestClassifier(n_estimators=100, 
                               random_state=50, 
                               max_features = 100,
                               n_jobs=-1, verbose = 1, warm_start = True)
model.fit(X_train_vectors, y_train)
predictions = model.predict(X_test_vectors)

print("Accuracy score (training): {0:.3f}".format(model.score(X_train_vectors, y_train)))
print("Accuracy score (validation): {0:.3f}".format(model.score(X_test_vectors, y_test)))
print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   50.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.7s finished


Accuracy score (training): 0.989


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


Accuracy score (validation): 0.786
F1 Score: 0.721
Recall: 0.668
Precision: 0.784


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


In [15]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission['target'] = model.predict(test_vec)
sample_submission.to_csv("submissionRandomForest.csv", index=False)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


In [17]:
maxfeatures = [65,70, 75, 80, 85]

for MF in maxfeatures:
    model = RandomForestClassifier(n_estimators=100, 
                               random_state=50, 
                               max_features = MF,
                               n_jobs=-1, verbose = 1)
    model.fit(X_train_vectors, y_train)
    predictions = model.predict(X_test_vectors)
    
    print(MF)
    print( )
    print("Accuracy score (training): {0:.3f}".format(model.score(X_train_vectors, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(model.score(X_test_vectors, y_test)))
    print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
    print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
    print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))
    print( )

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   37.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


65



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished


Accuracy score (training): 0.989


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


Accuracy score (validation): 0.790
F1 Score: 0.722
Recall: 0.657
Precision: 0.802



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   37.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


70



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


Accuracy score (training): 0.989


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


Accuracy score (validation): 0.786
F1 Score: 0.719
Recall: 0.657
Precision: 0.793



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   40.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


75



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


Accuracy score (training): 0.989


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


Accuracy score (validation): 0.789
F1 Score: 0.721
Recall: 0.658
Precision: 0.798



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   40.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


80



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


Accuracy score (training): 0.989


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


Accuracy score (validation): 0.791
F1 Score: 0.727
Recall: 0.670
Precision: 0.794



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   55.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


85



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished


Accuracy score (training): 0.989


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


Accuracy score (validation): 0.787
F1 Score: 0.722
Recall: 0.666
Precision: 0.789



[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished


In [18]:
model = RandomForestClassifier(n_estimators=100, 
                               random_state=50, 
                               max_features = 80,
                               n_jobs=-1, verbose = 1, warm_start = True)
model.fit(X_train_vectors, y_train)
predictions = model.predict(X_test_vectors)

print("Accuracy score (training): {0:.3f}".format(model.score(X_train_vectors, y_train)))
print("Accuracy score (validation): {0:.3f}".format(model.score(X_test_vectors, y_test)))
print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   46.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


Accuracy score (training): 0.989


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


Accuracy score (validation): 0.791
F1 Score: 0.727
Recall: 0.670
Precision: 0.794


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.4s finished


In [19]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission['target'] = model.predict(test_vec)
sample_submission.to_csv("2-RandomForestsubmission.csv", index=False)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


In [25]:
model = RandomForestClassifier(n_estimators=130, 
                               random_state=50, 
                               max_features = 80,
                               max_depth = 100,
                               n_jobs=-1, verbose = 1, warm_start = True)
model.fit(X_train_vectors, y_train)
predictions = model.predict(X_test_vectors)

print("Accuracy score (training): {0:.3f}".format(model.score(X_train_vectors, y_train)))
print("Accuracy score (validation): {0:.3f}".format(model.score(X_test_vectors, y_test)))
print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:   31.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.6s finished


Accuracy score (training): 0.943


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


Accuracy score (validation): 0.792
F1 Score: 0.723
Recall: 0.654
Precision: 0.807


[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.2s finished


In [36]:
model = RandomForestClassifier(n_estimators=130, 
                               random_state=50, 
                               max_features = 50,
                               max_depth = 100,
                               n_jobs=-1, verbose = 1, warm_start = True)
model.fit(X_train_vectors, y_train)
predictions = model.predict(X_test_vectors)

print("Accuracy score (training): {0:.3f}".format(model.score(X_train_vectors, y_train)))
print("Accuracy score (validation): {0:.3f}".format(model.score(X_test_vectors, y_test)))
print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:   27.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.8s finished


Accuracy score (training): 0.936


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


Accuracy score (validation): 0.789
F1 Score: 0.709
Recall: 0.620
Precision: 0.828


[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.2s finished


In [37]:
model = RandomForestClassifier(n_estimators=130, 
                               random_state=50, 
                               max_features = 50,
                               max_depth = 200,
                               n_jobs=-1, verbose = 1, warm_start = True)
model.fit(X_train_vectors, y_train)
predictions = model.predict(X_test_vectors)

print("Accuracy score (training): {0:.3f}".format(model.score(X_train_vectors, y_train)))
print("Accuracy score (validation): {0:.3f}".format(model.score(X_test_vectors, y_test)))
print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 130 out of 130 | elapsed:   32.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.6s finished


Accuracy score (training): 0.984


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s


Accuracy score (validation): 0.792
F1 Score: 0.722
Recall: 0.648
Precision: 0.814


[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.3s finished


In [38]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission['target'] = model.predict(test_vec)
sample_submission.to_csv("3-RandomForestsubmission.csv", index=False)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 130 out of 130 | elapsed:    0.6s finished
