In [1]:
# Install snownlp if you didn't use it before 
from snownlp import sentiment
from snownlp import SnowNLP
import pandas as pd
import numpy as np

# Re-train the snownlp model

In [2]:
# load the data for training ,positve and negative text we generate in step 1
negative_txt = "./neg_data0331.txt"
positive_txt = "./pos_data0331.txt"
neg_data0331 = pd.read_csv(negative_txt,sep='\n',error_bad_lines=False,encoding = "utf-8")
pos_data0331 = pd.read_csv(positive_txt,sep='\n',error_bad_lines=False,encoding = "utf-8")
# See how big the size of the data is.
print(np.shape(neg_data0331))
print(np.shape(pos_data0331))

(3774552, 1)
(4671653, 1)


In [3]:
# if we just put all the data for training, it will take a very long long time,exceed 10 hours we guess.
# And the convergency rate is very low when data size is large.
# So we define a function to choose sample of the sourcedata for training.
def choose_sample(q,neg_data,pos_data):
    neg_sample_idx = np.random.randint(0, len(neg_data), q)
    pos_sample_idx = np.random.randint(0, len(pos_data), q)
    pos_sample_data = np.array(pos_data)[pos_sample_idx]
    neg_sample_data = np.array(neg_data)[neg_sample_idx]
    return pos_sample_data,neg_sample_data

In [4]:
# Here we random select 100000 data for each type(positive and negative)
pos_data,neg_data=choose_sample(100000,neg_data0331,pos_data0331)

In [5]:
# Then randomly choose 1/3 of the choosen data  as the train set
np.random.seed(123)
neg_test_idx = np.random.randint(0, len(neg_data), len(neg_data) // 3)
pos_test_idx = np.random.randint(0, len(pos_data), len(pos_data) // 3)
#print(neg_test_idx,pos_test_idx)
print(np.shape(neg_test_idx),np.shape(pos_test_idx))

(33333,) (33333,)


In [6]:
# training data
pos_train_data = np.delete(np.array(pos_data), pos_test_idx, axis=0)
neg_train_data = np.delete(np.array(neg_data), neg_test_idx, axis=0)
print(np.shape(pos_train_data))
print(np.shape(neg_train_data))

(71719, 1)
(71479, 1)


In [7]:
# testing data
pos_test_data = np.array(pos_data)[pos_test_idx]
neg_test_data = np.array(neg_data)[neg_test_idx]

In [8]:
# save the test data to txt files for accuracy calculation in Step 3
pos_test_df = pd.DataFrame(pos_test_data)
neg_test_df = pd.DataFrame(neg_test_data)
pos_test_df.to_csv('./pos_test_data.txt',index=False,header = None,encoding='UTF-8')
neg_test_df.to_csv('./neg_test_data.txt',index=False,header = None,encoding='UTF-8')

In [9]:
# save the train data to txt files for training
pos_train_df = pd.DataFrame(pos_train_data)
neg_train_df = pd.DataFrame(neg_train_data)
pos_train_df.to_csv('./pos_train_data.txt',index=False,header = None,encoding='UTF-8')
neg_train_df.to_csv('./neg_train_data.txt',index=False,header = None,encoding='UTF-8')

In [10]:
# load the train data file
train_negative_txt = "./neg_train_data.txt"
train_positive_txt = "./pos_train_data.txt"
sentiment.train(train_negative_txt,train_positive_txt)
sentiment.save("./sentiment.marshal")


After you get the new model, there are two ways to use this model:
The first one is to change the "_init_" file in your "./site-packages/snownlp/sentiment" directory, you need to modify your data_path to your new model's directory;
The second one is delete the original model and put the new model(renamed as "sentiment.marshal" in your "./site-packages/snownlp/sentiment" directory.