In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

In [11]:
train = pd.read_csv("preprocessed_train_data.csv", index_col = 0)
test = pd.read_csv("preprocessed_test_data.csv", index_col = 0)

trained_tweets = train['keyword']+train['text']
test_tweets = test['keyword']+test['text']

In [12]:
state = 12  
test_size = 0.30  
  
X_train, X_test, y_train, y_test = train_test_split(trained_tweets, train['target'],  
    test_size=test_size, random_state=state)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

word_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    lowercase=True,
    min_df=5,
    max_features=30000)

char_vectorizer = TfidfVectorizer(
    analyzer='char',
    stop_words='english',
    ngram_range=(3, 6),
    lowercase=True,
    min_df=5,
    max_features=50000)

vectorizer = FeatureUnion([('word_vectorizer', word_vectorizer),  ('char_vectorizer', char_vectorizer)])
vectorizer.fit(X_train)

X_train_vectors = vectorizer.transform(X_train).toarray()
X_test_vectors = vectorizer.transform(X_test).toarray()
print(X_train_vectors.shape, X_test_vectors.shape)



MemoryError: Unable to allocate 1.75 GiB for an array with shape (5329, 44132) and data type float64

In [None]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    clf_xgb = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=learning_rate)
    clf_xgb.fit(X_train_vectors, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(clf_xgb.score(X_train_vectors, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(clf_xgb.score(X_test_vectors, y_test)))

In [28]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=20, max_depth=5, random_state=0)
    gb_clf.fit(X_train_vectors, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train_vectors, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test_vectors, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.609
Accuracy score (validation): 0.629
Learning rate:  0.075
Accuracy score (training): 0.636
Accuracy score (validation): 0.650
Learning rate:  0.1
Accuracy score (training): 0.661
Accuracy score (validation): 0.668
Learning rate:  0.25
Accuracy score (training): 0.739
Accuracy score (validation): 0.721
Learning rate:  0.5
Accuracy score (training): 0.767
Accuracy score (validation): 0.734
Learning rate:  0.75
Accuracy score (training): 0.766
Accuracy score (validation): 0.728
Learning rate:  1
Accuracy score (training): 0.769
Accuracy score (validation): 0.718


In [29]:
lr_list = [0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=100, max_depth=10, random_state=0)
    gb_clf.fit(X_train_vectors, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train_vectors, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test_vectors, y_test)))

Learning rate:  0.25
Accuracy score (training): 0.869
Accuracy score (validation): 0.765
Learning rate:  0.5
Accuracy score (training): 0.903
Accuracy score (validation): 0.746
Learning rate:  0.75
Accuracy score (training): 0.911
Accuracy score (validation): 0.747
Learning rate:  1
Accuracy score (training): 0.910
Accuracy score (validation): 0.731


In [32]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=learning_rate, max_features=60, max_depth=6, random_state=0)
    gb_clf.fit(X_train_vectors, y_train)
    
    predictions = gb_clf.predict(X_test_vectors)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train_vectors, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test_vectors, y_test)))
    print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
    print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
    print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))
    

Learning rate:  0.05
Accuracy score (training): 0.797
Accuracy score (validation): 0.756
F1 Score: 0.623
Recall: 0.486
Precision: 0.867
Learning rate:  0.075
Accuracy score (training): 0.835
Accuracy score (validation): 0.772
F1 Score: 0.665
Recall: 0.545
Precision: 0.852
Learning rate:  0.1
Accuracy score (training): 0.854
Accuracy score (validation): 0.779
F1 Score: 0.687
Recall: 0.583
Precision: 0.835
Learning rate:  0.25
Accuracy score (training): 0.920
Accuracy score (validation): 0.782
F1 Score: 0.713
Recall: 0.650
Precision: 0.789
Learning rate:  0.5
Accuracy score (training): 0.966
Accuracy score (validation): 0.757
F1 Score: 0.688
Recall: 0.647
Precision: 0.736
Learning rate:  0.75
Accuracy score (training): 0.974
Accuracy score (validation): 0.762
F1 Score: 0.701
Recall: 0.671
Precision: 0.734
Learning rate:  1
Accuracy score (training): 0.976
Accuracy score (validation): 0.749
F1 Score: 0.690
Recall: 0.674
Precision: 0.707


In [33]:
#Highest is 0.05
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=learning_rate, max_features=70, max_depth=20, random_state=0)
    gb_clf.fit(X_train_vectors, y_train)
    
    predictions = gb_clf.predict(X_test_vectors)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train_vectors, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test_vectors, y_test)))
    print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
    print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
    print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))
    

Learning rate:  0.05
Accuracy score (training): 0.954
Accuracy score (validation): 0.793
F1 Score: 0.720
Recall: 0.642
Precision: 0.820
Learning rate:  0.075
Accuracy score (training): 0.976
Accuracy score (validation): 0.790
F1 Score: 0.721
Recall: 0.653
Precision: 0.805
Learning rate:  0.1
Accuracy score (training): 0.986
Accuracy score (validation): 0.789
F1 Score: 0.723
Recall: 0.664
Precision: 0.795
Learning rate:  0.25
Accuracy score (training): 0.989
Accuracy score (validation): 0.786
F1 Score: 0.721
Recall: 0.667
Precision: 0.785
Learning rate:  0.5
Accuracy score (training): 0.989
Accuracy score (validation): 0.782
F1 Score: 0.719
Recall: 0.672
Precision: 0.773
Learning rate:  0.75
Accuracy score (training): 0.989
Accuracy score (validation): 0.773
F1 Score: 0.706
Recall: 0.657
Precision: 0.763
Learning rate:  1
Accuracy score (training): 0.988
Accuracy score (validation): 0.763
F1 Score: 0.696
Recall: 0.652
Precision: 0.745


In [34]:
lr_list = [0.03, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=120, learning_rate=learning_rate, max_features=100, max_depth=25, random_state=0)
    gb_clf.fit(X_train_vectors, y_train)
    
    predictions = gb_clf.predict(X_test_vectors)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train_vectors, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test_vectors, y_test)))
    print("F1 Score: {0:.3f}".format(f1_score(y_test,predictions)))
    print("Recall: {0:.3f}".format(recall_score(y_test,predictions)))
    print("Precision: {0:.3f}".format(precision_score(y_test,predictions)))
    

Learning rate:  0.03
Accuracy score (training): 0.961
Accuracy score (validation): 0.787
F1 Score: 0.714
Recall: 0.640
Precision: 0.806
Learning rate:  0.05
Accuracy score (training): 0.982
Accuracy score (validation): 0.789
F1 Score: 0.723
Recall: 0.664
Precision: 0.794
Learning rate:  0.075
Accuracy score (training): 0.989
Accuracy score (validation): 0.797
F1 Score: 0.735
Recall: 0.677
Precision: 0.804
Learning rate:  0.1
Accuracy score (training): 0.989
Accuracy score (validation): 0.791
F1 Score: 0.727
Recall: 0.673
Precision: 0.792
Learning rate:  0.25
Accuracy score (training): 0.989
Accuracy score (validation): 0.775
F1 Score: 0.709
Recall: 0.660
Precision: 0.766
Learning rate:  0.5
Accuracy score (training): 0.989
Accuracy score (validation): 0.777
F1 Score: 0.714
Recall: 0.671
Precision: 0.763
Learning rate:  0.75
Accuracy score (training): 0.989
Accuracy score (validation): 0.761
F1 Score: 0.700
Recall: 0.671
Precision: 0.731


KeyboardInterrupt: 