In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation

from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the Sentiment140 CSV, supplying the column names explicitly via `names`,
# and discard any row that contains a missing value.
labels = ['polarity', 'id', 'date', 'query', 'user', 'text']
data = pd.read_csv(
    "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
    names=labels,
    encoding='latin-1',
).dropna()


In [None]:
# Keep only the tweet text and its polarity, then recode polarity 4 as 1
# (per the original note, this leaves the labels as 0 / 1).
#
# `.copy()` makes the two-column frame independent of the frame it was sliced
# from, and the recode is assigned back to the column explicitly. The previous
# `data.polarity.replace(4, 1, inplace=True)` was a chained in-place call: it
# triggers chained-assignment warnings and has no effect on `data` when pandas
# Copy-on-Write is enabled.
data = data[['text', 'polarity']].copy()
data['polarity'] = data['polarity'].replace(4, 1)

In [None]:
# Start from NLTK's English stop-word list.
stops = stopwords.words("english")

# For every stop word that contains an apostrophe, also register the same word
# with the apostrophe stripped out.
no_quotes = [re.sub(r'\'', '', word) for word in stops if "'" in word]
stops.extend(no_quotes)


In [None]:
def clean_string(string):
    """Normalise a raw tweet into lower-case, letters-only text.

    The substitutions run in a fixed order, with lower-casing in the middle:

    1. drop HTML entities such as ``&amp;``
    2. drop ``@user`` mentions
    3. drop http/https/ftp links
    4. lower-case the text
    5. drop ``#hashtags``
    6. shorten any run of a repeated character to two occurrences
    7. replace every non-letter character with a space
    8. drop words that are one or two characters long
    9. collapse runs of whitespace into a single space

    Parameters
    ----------
    string : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces may remain.
    """
    # (pattern, replacement) pairs applied before lower-casing.
    case_sensitive_rules = (
        (r'\&\w*;', ''),                                   # HTML entities
        (r'@(\w+)', ''),                                   # @user mentions
        (r'(http|https|ftp)://[a-zA-Z0-9\\./]+', ''),      # links
    )
    # (pattern, replacement) pairs applied after lower-casing.
    lowercase_rules = (
        (r'#(\w+)', ''),                # hashtags
        (r'(.)\1{1,}', r'\1\1'),        # repeated characters -> two
        ("[^a-zA-Z]", " "),             # anything that is not a letter
        (r'\b\w{1,2}\b', ''),           # one- or two-character words
        (r'\s\s+', ' '),                # multiple whitespace
    )

    text = string
    for pattern, replacement in case_sensitive_rules:
        text = re.sub(pattern, replacement, text)

    text = text.lower()

    for pattern, replacement in lowercase_rules:
        text = re.sub(pattern, replacement, text)

    return text


In [None]:
def preprocess(string):
    """Strip punctuation, drop stop words and stem the remaining words.

    The text is split on single spaces. Each token is lower-cased, skipped if
    it appears in the module-level ``stops`` list, and otherwise reduced to its
    Porter stem. The surviving stems are joined back with single spaces.

    The stop-word test is performed on the lower-cased token so that
    capitalised stop words (e.g. "The") are filtered just like their
    lower-case forms; previously the raw token was tested and only the
    stemmed output was lower-cased.

    Parameters
    ----------
    string : str
        Text to preprocess.

    Returns
    -------
    str
        Space-joined stems of the non-stop-word tokens.
    """
    stemmer = PorterStemmer()

    # Remove punctuation characters.
    removed_punc = ''.join(char for char in string
                           if char not in punctuation)

    cleaned = []
    # Remove stop words (case-insensitively) and stem what is left.
    for word in removed_punc.split(' '):
        lowered = word.lower()
        if lowered not in stops:
            cleaned.append(stemmer.stem(lowered))
    return ' '.join(cleaned)


In [None]:
def check_features_ngrams(features, n_grams, classifiers):
    """Evaluate classifiers for one TF-IDF configuration and log accuracies.

    A ``TfidfVectorizer`` is built with the given feature cap and n-gram
    range and fitted on ``data.text``. Each classifier is then trained on
    rows ``[:train_size]`` and scored on those rows (train accuracy) and on
    rows ``[test_start:test_end]`` (test accuracy). One CSV line per
    classifier is appended to ``outs.txt``:
    ``features,<upper n-gram bound>,name,train_acc,test_acc``.

    Reads the module-level names ``data``, ``train_size``, ``test_start`` and
    ``test_end``.

    Parameters
    ----------
    features : int
        Passed to ``TfidfVectorizer(max_features=...)``.
    n_grams : tuple
        Passed to ``TfidfVectorizer(ngram_range=...)``; its last element is
        what gets written to the output file.
    classifiers : iterable of (str, estimator)
        Name / estimator pairs; each estimator is fitted in place.

    Returns
    -------
    None
    """
    print(features, n_grams)

    # Configure the TF-IDF vectorizer.
    tf = TfidfVectorizer(max_features = features,
                         ngram_range = n_grams,
                         stop_words = 'english')

    # Learn the vocabulary / IDF weights from the text column.
    # NOTE(review): this fits on every row of `data`, including the rows later
    # used for testing — confirm whether fitting on the training rows only is
    # wanted.
    tf.fit(data.text)

    # Vectorize only the rows that are actually used, once, and keep them
    # sparse. Transforming the whole corpus and calling `.toarray()` on the
    # slices needs tens of gigabytes at the larger feature counts; the
    # estimators used in this notebook accept sparse input directly.
    x_train = tf.transform(data.text.iloc[:train_size])
    y_train = data.polarity.iloc[:train_size].values
    x_test = tf.transform(data.text.iloc[test_start:test_end])
    y_test = data.polarity.iloc[test_start:test_end].values

    # Re-seed NumPy's global RNG before any estimator is fitted
    # (presumably so estimators without a random_state are repeatable).
    np.random.seed(123456)

    def check_classifier(name, classifier):
        """Fit one classifier, score it, and append its row to outs.txt."""
        print('--'+name+'--')

        # Train, then measure in-sample accuracy.
        classifier.fit(x_train, y_train)
        i_s = metrics.accuracy_score(y_train,
                                     classifier.predict(x_train))

        # Measure out-of-sample accuracy on the test rows.
        oos = metrics.accuracy_score(y_test,
                                     classifier.predict(x_test))

        # Append the result row.
        with open("outs.txt","a") as f:
            f.write('%s,%s,%s,%.4f,%.4f\n'
                    % (features, n_grams[-1], name, i_s, oos))

    for name, classifier in classifiers:
        check_classifier(name, classifier)


In [None]:
# Shuffle the rows (seeded, so the train/test split is the same on every run),
# then clean and preprocess every tweet.
data = data.sample(frac=1, random_state=123456).reset_index(drop=True)
data['text'] = data['text'].apply(clean_string)
data['text'] = data['text'].apply(preprocess)

# Rows [0, train_size) are used for training, rows [test_start, test_end) for
# testing.
train_size = 10000
test_start = 10000
test_end = 100000

# Write the CSV header. The file is opened in "w" mode so that re-running this
# cell starts from an empty results file instead of piling new rows on top of
# the previous run's rows. The header now has a comma between `classifier` and
# `train_acc`, giving it the same five columns as the data rows.
with open("outs.txt","w") as f:
    f.write('features,ngram_range,classifier,train_acc,test_acc\n')

# Try every combination of feature count and n-gram range.
for features in [500, 1000, 5000, 10000, 20000, 30000]:
    for n_grams in [(1, 1), (1, 2), (1, 3)]:

        # Voting ensemble over fresh instances of the three base learners.
        voting = VotingClassifier([('DT',
                                    DecisionTreeClassifier()),
                                   ('NB',
                                    MultinomialNB()),
                                   ('Ridge',
                                    RidgeClassifier())])

        # Evaluate each base learner on its own as well as the ensemble.
        classifiers = [('DT',
                        DecisionTreeClassifier()),
                       ('NB',
                        MultinomialNB()),
                       ('Ridge',
                        RidgeClassifier()),
                       ('Voting',
                        voting)]

        # Train and score the models for this configuration.
        check_features_ngrams(features, n_grams, classifiers)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Read every line of the results file; the `with` block closes the file.
with open("outs.txt", 'r') as f:
    text = f.readlines()

# Feature counts on the x axis, in the order the experiments were run.
x = [500, 1000, 5000, 10000, 20000, 30000]

# Line style per n-gram upper bound; 1-gram keeps matplotlib's default style.
ngram_styles = {1: {}, 2: {"linestyle": "-."}, 3: {"linestyle": "--"}}

for model in ["DT", "NB", "Ridge", "Voting"]:
    # Test accuracies for this model, grouped by n-gram upper bound.
    acc_by_ngram = {n_gram: [] for n_gram in ngram_styles}
    for line in text:
        token = line.strip().split(",")
        # Skip blank / malformed lines and rows of other models. The header
        # line is skipped here too, because its third field is not a model
        # name.
        if len(token) != 5 or token[2] != model:
            continue
        n_gram = int(token[1])
        if n_gram in acc_by_ngram:
            # Fifth column is the test-set accuracy.
            acc_by_ngram[n_gram].append(float(token[4]))

    fig, ax = plt.subplots(figsize = (8, 8))
    for n_gram, style in ngram_styles.items():
        ax.plot(x, acc_by_ngram[n_gram],
                label = "%d-gram" % n_gram, **style)
    ax.set_xlabel("Features")
    ax.set_ylabel("Accuracy")
    ax.legend()
    ax.set_title(model)
    plt.show()
    # Release this figure so figures do not accumulate across models.
    plt.close(fig)
