In [93]:
import pandas as pd
import numpy as np
import nltk
from functools import reduce
import random
df = pd.read_csv('savedf.csv')

# 1. Preparation for Labeling

# 1.1 Check the sample size
n = df.groupby('Year').agg({"#":"count"})
n['sample'] = n['#']*0.1
print(n)

# 1.2 Generate random integers without replacement to select rows from each year
random.seed(7735)

def sample(l, size):
    sample = random.sample(range(1, l+1), size) 
    return sample

s10 = sample(945, 95)
s11 = sample(867, 87)
s12 = sample(541, 54)
s13 = sample(403, 40)
s14 = sample(341, 34)
s15 = sample(403, 40)
s16 = sample(334, 33)

# 1.3 Select all the training data for labeling
df10 = df[df['Year'] == 2010]
df11 = df[df['Year'] == 2011]
df12 = df[df['Year'] == 2012]
df13 = df[df['Year'] == 2013]
df14 = df[df['Year'] == 2014]
df15 = df[df['Year'] == 2015]
df16 = df[df['Year'] == 2016]

train10 = df10[df10['#'].isin(s10)]
train11 = df11[df11['#'].isin(s11)]
train12 = df12[df12['#'].isin(s12)]
train13 = df13[df13['#'].isin(s13)]
train14 = df14[df14['#'].isin(s14)]
train15 = df15[df15['#'].isin(s15)]
train16 = df16[df16['#'].isin(s16)]

# 1.4 Mannually label train10~train16 in the file label.xlsx
lab = pd.read_excel('label.xlsx')
np.mean(lab)

        #  sample
Year             
2010  945    94.5
2011  867    86.7
2012  541    54.1
2013  403    40.3
2014  341    34.1
2015  403    40.3
2016  334    33.4


Year    2012.216710
#         32.078329
pos        0.665796
dis        0.214099
dtype: float64

In [94]:
# 2. Split the Labeled and Unlabeled Data

def combine(train, Year):
    train.loc[:,['pos', 'dis']] = np.array(lab.loc[lab['Year'] == Year, ['pos', 'dis']])
    return train

labeled = reduce(lambda top, bottom: top.append(bottom), 
               [combine(train10, 2010), combine(train11, 2011), combine(train12, 2012), combine(train13, 2013),
                combine(train14, 2014), combine(train15, 2015), combine(train16, 2016)])

unlabeled_index = list(set(df.index) - set(labeled.index))
unlabeled = df.iloc[unlabeled_index, :]

In [95]:
# 3. Model Selection on the Labeled Data —— Positive Coverage ('pos')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score

# 3.1 Train-test split
X_train, X_test, y_train, y_test = train_test_split(labeled['Text_truncated'], list(labeled['pos']), random_state=0, test_size=0.05)

vec = CountVectorizer().fit(X_train)
X_train_vec = vec.transform(X_train)
#lname = vec.get_feature_names()
#max(lname, key=len)

# 3.2 Search for the best parameter(s)

mnb = MultinomialNB()
mnb_param = {'alpha': [10, 50, 100, 500, 750, 1000]}

lg = LogisticRegression(solver='liblinear', max_iter=500)
lg_param = {'C':[0.0001, 0.0005, 0.001, 0.01]}
             
clf_pos = GridSearchCV(lg, param_grid=lg_param, scoring='roc_auc', cv=5, iid=True, return_train_score=True).fit(X_train_vec, y_train)

print(clf_pos.cv_results_['mean_test_score'])
print(clf_pos.cv_results_['mean_train_score'])

# 3.3 Check the performance on the test data

pos = clf_pos.predict(vec.transform(X_test))
print(np.mean(pos), np.mean(y_test)) 

def scores(y_pre): 
    rec = recall_score(y_pre, y_test)
    pre = precision_score(y_pre, y_test)
    f1 = f1_score(y_pre, y_test)
    auc = roc_auc_score(y_pre, y_test)
    acc = accuracy_score(y_pre, y_test)

    return pd.DataFrame({'scores':[rec,pre,f1,auc,acc]}, index=['recall','precision','f1','roc_auc','accuracy'])

scores(pos)

# Conclusion: Logistic regression is better at predicting positive coverage 

[0.83515957 0.86370931 0.86835905 0.87530832]
[0.90080009 0.95506418 0.97763172 0.99998937]
0.65 0.65


Unnamed: 0,scores
recall,0.923077
precision,0.923077
f1,0.923077
roc_auc,0.89011
accuracy,0.9


In [96]:
# 4. Model Selection on the Labeled Data —— Labor Disputes ('dis')

# 4.1 Train-test split
X_train, X_test, y_train, y_test = train_test_split(labeled['Text_truncated'], list(labeled['dis']), random_state=0, test_size=0.25)
vec = CountVectorizer().fit(X_train)
X_train_vec = vec.transform(X_train)


# 4.2 Parameter search
mnb_param_d = {'alpha': [0.1, 0.5, 1, 5, 10, 25]}
lg_param_d = {'C':[0.0001, 0.0005, 0.001, 0.01]}

clf_dis = GridSearchCV(mnb, param_grid=mnb_param_d, scoring='roc_auc', cv=5, iid=True, return_train_score=True).fit(X_train_vec, y_train)
print(clf_dis.cv_results_['mean_test_score'])
print(clf_dis.cv_results_['mean_train_score'])

# 4.3 Check performance on test data

dis = clf_dis.predict(vec.transform(X_test))
print(np.mean(dis), np.mean(y_test))

scores(dis)

# Conclusion: Multinomial naive bayes is better at predicting disputes

[0.89909862 0.88626387 0.88387576 0.83998384 0.82088404 0.78561497]
[0.99779613 0.99778468 0.99747485 0.94161013 0.89526415 0.85091727]
0.21875 0.22916666666666666


Unnamed: 0,scores
recall,0.904762
precision,0.863636
f1,0.883721
roc_auc,0.932381
accuracy,0.947917


In [97]:
# 5 Make Predictions Using all Labelled Data
text = labeled['Text_truncated']
ypos = list(labeled['pos'])
ydis = list(labeled['dis'])

vec = CountVectorizer().fit(text)
predictor = vec.transform(text)

cls_pos = GridSearchCV(lg, param_grid=lg_param, cv=5, iid=True, return_train_score=True).fit(predictor, ypos)
print(clf_pos.cv_results_['mean_test_score'])
print(clf_pos.cv_results_['mean_train_score'])

cls_dis = GridSearchCV(mnb, param_grid=mnb_param_d, cv=5, iid=True, return_train_score=True).fit(predictor, ydis)
print(clf_dis.cv_results_['mean_test_score'])
print(clf_dis.cv_results_['mean_train_score'])

pred_pos = cls_pos.predict(vec.transform(unlabeled['Text_truncated']))
pred_dis = cls_dis.predict(vec.transform(unlabeled['Text_truncated']))

np.mean(pred_pos), np.mean(pred_dis)

[0.83515957 0.86370931 0.86835905 0.87530832]
[0.90080009 0.95506418 0.97763172 0.99998937]
[0.89909862 0.88626387 0.88387576 0.83998384 0.82088404 0.78561497]
[0.99779613 0.99778468 0.99747485 0.94161013 0.89526415 0.85091727]


(0.7299333526514054, 0.209214720370907)

In [99]:
# 6. Prepare Data for Regression Analysis

unlabeled.loc[:, "pos"]= pred_pos
unlabeled.loc[:, 'dis'] = pred_dis
cols = labeled.append(unlabeled).groupby(["Year", "Province"])[["pos", "dis"]].mean()
cols

Unnamed: 0_level_0,Unnamed: 1_level_0,pos,dis
Year,Province,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,Anhui,0.771930,0.140351
2010,Beijing,0.800000,0.333333
2010,Chongqing,0.676471,0.058824
2010,Fujian,0.823529,0.235294
2010,Gansu,0.807692,0.115385
2010,Guangdong,0.693548,0.016129
2010,Guangxi,0.681818,0.136364
2010,Guizhou,0.809524,0.095238
2010,Hainan,0.800000,0.400000
2010,Hebei,0.821429,0.357143


In [214]:
cols.to_excel("pos_dis_allyear.xlsx")