In [None]:
import sys,csv
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.pipeline import (make_pipeline, Pipeline)
from sklearn.metrics import make_scorer
from sklearn.model_selection import (train_test_split, cross_val_score, GridSearchCV)
from sklearn.utils import resample
import pandas as pd
import itertools

In [None]:
# Requires the third-party `thulac` package (install with: pip install thulac).
import thulac
# Single shared segmenter instance; later cells call `thu.cut(text, text=True)` on it.
# NOTE(review): T2S / seg_only presumably enable traditional-to-simplified conversion
# and segmentation without POS tags — confirm against the thulac documentation.
thu = thulac.thulac(T2S=True, seg_only=True)

In [None]:
def combine_configuration():
    """Enumerate every (vectorizer class, classifier instance) pair to evaluate.

    Returns:
        list[tuple]: 2-tuples ``(vectorizer_class, classifier)``. Vectorizers
        vary slowest, so all classifiers are listed for ``CountVectorizer``
        before any for ``TfidfVectorizer``. The vectorizer is returned as a
        class (not instantiated); the classifier is an instance that is shared
        between the pairs it appears in.
    """
    vectorizer_classes = (CountVectorizer, TfidfVectorizer)
    # MultinomialNB() and LogisticRegression(max_iter=1000) are currently
    # left out of the comparison; only the two SVC kernels are evaluated.
    classifier_instances = (SVC(kernel='rbf'), SVC(kernel='linear'))
    return [
        (vectorizer_cls, classifier)
        for vectorizer_cls in vectorizer_classes
        for classifier in classifier_instances
    ]

def down_sample_majority(df, majortopic, downsample, random_state=None):
    """Down-sample the majority class of a binary (0/1) label column.

    The majority class is 1 when fewer than half of the rows are labelled 0,
    otherwise 0. All minority rows are kept and ``len(minority) * downsample``
    majority rows are drawn without replacement (capped at the number of
    majority rows available).

    Args:
        df: DataFrame containing the label column.
        majortopic: Name of the binary label column (values 0 and 1).
        downsample: Majority-to-minority ratio to keep. May be fractional;
            the resulting row count is truncated to an integer.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (default) keeps it random.

    Returns:
        DataFrame with the minority rows followed by the sampled majority
        rows. An empty input is returned unchanged.
    """
    if len(df) == 0:
        # Nothing to balance; also avoids dividing by zero below.
        return df

    zero_share = (df[majortopic] == 0).sum() / len(df)
    majority = int(zero_share < 0.5)  # label 1 is the majority when label 0 covers < 50%
    minority = 1 - majority
    df_majority = df[df[majortopic] == majority]
    df_minority = df[df[majortopic] == minority]

    # Multiply first and truncate afterwards so a fractional ratio still
    # produces an integer sample size, then cap at what is available.
    n = min(int(len(df_minority) * downsample), len(df_majority))
    df_majority_downsampled = df_majority.sample(
        n=n, replace=False, random_state=random_state)

    df_downsampled = pd.concat([df_minority, df_majority_downsampled])
    print(len(df_minority))
    return df_downsampled
    
def machine_learning(df, labels, downsample = 0):
    """Grid-search every vectorizer/classifier pair and return the best model.

    The frame is optionally balanced with ``down_sample_majority``, split
    80/20 into train and held-out sets, and each configuration from
    ``combine_configuration`` is tuned with 5-fold ``GridSearchCV`` (scored
    by accuracy) on the training part and then evaluated on the held-out part.

    Args:
        df: DataFrame with a ``'seg'`` column of segmented text and the label
            column named by ``labels``.
        labels: Name of the label column to predict.
        downsample: When > 0, majority-to-minority ratio passed to
            ``down_sample_majority``; 0 (default) disables down-sampling.

    Returns:
        tuple: ``(acc, search_max)`` where ``acc`` is a DataFrame with one row
        per configuration (columns Vectorizer, Classifier, Parameters, F1,
        Precision, Recall, Accuracy, Ratio; F1/Precision/Recall are left
        empty) and ``search_max`` is the fitted ``GridSearchCV`` with the
        highest held-out accuracy (``None`` if there were no configurations).
    """
    result_columns = ['Vectorizer', 'Classifier', 'Parameters', 'F1',
                      'Precision', 'Recall', 'Accuracy', 'Ratio']

    if downsample > 0:
        df = down_sample_majority(df, labels, downsample)

    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df['seg'].to_list(), df[labels].to_list(), test_size=0.2)

    # Accuracy is used for every label; a precision scorer for the binary
    # case was tried previously and is no longer active.
    scorer = 'accuracy'

    print(f'train: {len(train_labels)}, test: {len(test_labels)}')
    label_counts = Counter(train_labels)
    print(label_counts)
    positive_ratio = label_counts[1] / len(train_labels)

    vectorizer_grid = {"vectorizer__ngram_range": [(1,1), (1,2)],
                       "vectorizer__max_df": [0.5, 1.0],
                       # An integer min_df of 1 filters nothing (same effect as 0)
                       # and is accepted by scikit-learn's parameter validation.
                       "vectorizer__min_df": [1, 5]}

    records = []
    search_max = None
    acc_max = -1.0  # below any real accuracy, so the first model is always kept
    for vectorizer, classifier in combine_configuration():
        pipeline = Pipeline(steps = [
          ("vectorizer", vectorizer()),
          ("classifier", classifier)])

        grid = dict(vectorizer_grid)
        # Only tune the regularisation strength for classifiers that expose
        # it, instead of fitting once and retrying after any exception.
        if "C" in classifier.get_params():
            grid["classifier__C"] = [0.01, 1, 100]

        print(vectorizer, classifier)
        search = GridSearchCV(estimator=pipeline, n_jobs=-1, param_grid=grid,
                              scoring=scorer, cv=5)
        search.fit(train_texts, train_labels)

        y_pred = search.predict(test_texts)
        accuracy = metrics.accuracy_score(test_labels, y_pred)
        records.append({'Vectorizer': vectorizer,
                        'Classifier': classifier,
                        'Parameters': search.best_params_,
                        'Accuracy': accuracy,
                        'Ratio': positive_ratio})

        # Track the running best so the returned search really is the most
        # accurate one rather than the last one evaluated.
        if accuracy > acc_max:
            acc_max = accuracy
            search_max = search

    # Build the summary once; DataFrame.append is gone in pandas >= 2.0.
    acc = pd.DataFrame(records, columns=result_columns)
    return acc, search_max


Segmentation for Weibo was done and saved beforehand, also using `.apply(lambda x: thu.cut(x, text=True))`.

It is not repeated here because it takes hours to run.

In [None]:
# Load the Weibo posts from CSV and preview the first rows.
# NOTE(review): the file name suggests the segmented text is already stored in it — confirm it has a 'seg' column.
weibo_posts = pd.read_csv('data/weibo_posts_moral_seg.csv')
weibo_posts.head()

In [None]:
# Load the Zhihu answers and segment every post with the shared THULAC instance.
# The segmented result is stored in 'seg', the column the classifiers below read.
zhihu_posts = pd.read_csv('data/zhihu_answers_moral.csv')
zhihu_posts['seg'] = zhihu_posts['post_content'].apply(lambda x: thu.cut(x, text=True))

## Machine learning

In [None]:
# Load the annotated sample used for training/evaluation and show its row count.
data = pd.read_excel('annotation.xlsx')
len(data)

In [None]:
# Segment the annotated posts so they match the format of the prediction corpora.
data['seg'] = data['post_content'].apply(lambda x: thu.cut(x, text=True))
# Binary indicator: 1 when Stance_post equals 1, otherwise 0.
data['pure_fem'] = data['Stance_post'].apply(lambda x: 1 if x == 1 else 0)
# Hold out 20% of the annotations for the final reports below.
# No random_state is set, so the split differs between runs.
train_data, test_data = train_test_split(data, test_size=0.2)

In [None]:
# Train the relevance classifier (majority class down-sampled with ratio 2)
# and report its performance on the held-out annotations.
label = 'Relevant'
acc, search = machine_learning(train_data,label,downsample = 2)
pred = search.predict(test_data['seg'])
print(metrics.classification_report(test_data[label],pred))
# Prediction for the Weibo posts is currently disabled:
# weibo_posts['pred_relevant'] = search.predict(weibo_posts['seg'])

In [None]:
# Apply the relevance classifier to all Zhihu posts.
zhihu_posts['pred_relevant'] = search.predict(zhihu_posts['seg'])

In [None]:
# Train the post-stance classifier (no down-sampling) and report its
# performance on the held-out annotations.
label = 'Stance_post'
acc, search_stance = machine_learning(train_data,label,downsample = 0)
pred_stance = search_stance.predict(test_data['seg'])
print(metrics.classification_report(test_data[label],pred_stance))
# Prediction for the Weibo posts is currently disabled:
# weibo_posts['pred_stance'] = search_stance.predict(weibo_posts['seg'])

In [None]:
# Apply the post-stance classifier to all Zhihu posts.
zhihu_posts['pred_stance'] = search_stance.predict(zhihu_posts['seg'])

In [None]:
# Train the 'Mention_feminist' classifier (majority class down-sampled 1:1),
# report held-out performance, then predict for the Zhihu posts.
label = 'Mention_feminist'
acc, search_mfem = machine_learning(train_data,label,downsample = 1)
pred_mfem = search_mfem.predict(test_data['seg'])
print(metrics.classification_report(test_data[label],pred_mfem))
# Prediction for the Weibo posts is currently disabled:
# weibo_posts['pred_mfem'] = search_mfem.predict(weibo_posts['seg'])
zhihu_posts['pred_mfem'] = search_mfem.predict(zhihu_posts['seg'])

In [None]:
# Train the 'Mention_antifeminist' classifier (majority class down-sampled 1:1)
# and report its performance on the held-out annotations.
label = 'Mention_antifeminist'
acc, search_matfem = machine_learning(train_data,label,downsample = 1)
pred_matfem = search_matfem.predict(test_data['seg'])
print(metrics.classification_report(test_data[label],pred_matfem))

In [None]:
# Prediction for the Weibo posts is currently disabled:
# weibo_posts['pred_matfem'] = search_matfem.predict(weibo_posts['seg'])
# Apply the 'Mention_antifeminist' classifier to all Zhihu posts.
zhihu_posts['pred_matfem'] = search_matfem.predict(zhihu_posts['seg'])

In [None]:
# Train the 'Engagement' classifier (majority class down-sampled 1:1),
# report held-out performance, then predict for the Zhihu posts.
label = 'Engagement'
acc, search_egm = machine_learning(train_data,label,downsample = 1)
pred_egm = search_egm.predict(test_data['seg'])
print(metrics.classification_report(test_data[label],pred_egm))
# Prediction for the Weibo posts is currently disabled:
# weibo_posts['pred_engagement'] = search_egm.predict(weibo_posts['seg'])
zhihu_posts['pred_engagement'] = search_egm.predict(zhihu_posts['seg'])

In [None]:
# Persist the Weibo posts for the comment-processing section further down.
# NOTE(review): every `weibo_posts['pred_*'] = ...` assignment in the cells above is
# commented out, so on a fresh run this file has no prediction columns unless the
# input CSV already contained them; a later cell reads 'pred_relevant' and
# 'pred_stance' from this file — confirm.
weibo_posts.to_csv('weibo_posts_ML.csv',index=False)

In [None]:
# Persist the Zhihu posts together with the prediction columns added above.
zhihu_posts.to_csv('zhihu_posts_ML.csv',index=False)

## Machine learning for comments

In [None]:
# Annotated rows that are comments on relevant posts. Take an explicit copy so
# the columns added below are written to an independent frame and not to a
# slice of `data` (which triggers SettingWithCopyWarning and may be lost).
comments = data[(data['Is_comment']==1) & (data['Relevant']==1)].copy()
# Maps the numeric post-stance code to the (pre-segmented) phrase inserted
# into the classifier input.
stance_dict = {0:'无立场',1:'明确 的 女权主义者',2:'女权主义者 但 对 其他 女权主义者 表示 不满',3:'反 女权主义者'}
# Classifier input: commented-on post text, the post's stance, then the comment itself.
# Built in one pass, so the column exists even when no row matches the filter.
comments['content'] = [
    f"被评论的文本为：{post_text}。被评论者的立场为：{stance_dict[post_stance]}。评论内容为：{comment_text}"
    for post_text, post_stance, comment_text in zip(
        comments['post_content'], comments['Stance_post'], comments['comment_content'])
]
comments['seg'] = comments['content'].apply(lambda x: thu.cut(x, text=True))
# Hold out 20% of the annotated comments for the report below (unseeded split).
train_data_cm, test_data_cm = train_test_split(comments, test_size=0.2)

In [None]:
# Train the comment-stance classifier (no down-sampling) on the annotated
# comments and report its performance on the held-out comments.
label = 'Stance_comment'
acc, search_stance_cm = machine_learning(train_data_cm,label,downsample = 0)
pred_stance_cm = search_stance_cm.predict(test_data_cm['seg'])
print(metrics.classification_report(test_data_cm[label],pred_stance_cm))

In [None]:
# Predict the stance of every Weibo comment from its segmented content.
# NOTE(review): in this notebook `weibo_comments` is first assigned in a later cell
# (the "Comments segmentation" section), so this cell fails on a fresh
# Restart & Run All unless that section is executed first.
weibo_comments['pred_stance_comment'] = search_stance_cm.predict(weibo_comments['seg_content'])

In [None]:
# Save the Weibo comment predictions without the 'seg_content' column.
# NOTE(review): relies on `weibo_comments`, which is first assigned in a later cell.
weibo_comments.drop('seg_content', axis=1).to_csv('weibo_comments_ML.csv',index=False)

## Comments segmentation

In [None]:
from tqdm import tqdm
import re

In [None]:
# Load the Weibo comments; later cells read its 'seg_content', 'answer_id'
# and 'pred_relevant' columns.
weibo_comments = pd.read_csv('data/weibo_comments_moral_seg.csv')

In [None]:
# Reload the Weibo posts written earlier; the loop below reads 'mid',
# 'pred_relevant' and 'pred_stance' from it.
weibo_posts = pd.read_csv('weibo_posts_ML.csv')

In [None]:
# Stance code -> phrase inserted into the classifier input. Same mapping as the
# one defined in the "Machine learning for comments" section.
stance_dict = {0:'无立场',1:'明确 的 女权主义者',2:'女权主义者 但 对 其他 女权主义者 表示 不满',3:'反 女权主义者'}

In [None]:
# Copy each parent post's predicted relevance and stance onto its comments and
# rewrite the stance phrase embedded in the already-segmented comment text.
# Comments are left untouched when no parent post is found, when the post's
# stance has no entry in `stance_dict` (e.g. NaN), or when the comment already
# carries the same relevance value. Missing columns now raise instead of
# silently skipping every row.
for i, comment in tqdm(weibo_comments.iterrows(), total=len(weibo_comments)):
    match = weibo_posts[weibo_posts['mid']==comment['answer_id']]
    if match.empty:
        continue
    relevant = match['pred_relevant'].values[0]
    post_stance = match['pred_stance'].values[0]
    stance_str = stance_dict.get(post_stance)
    if stance_str is None:
        continue
    if comment['pred_relevant'] == relevant:
        continue
    weibo_comments.loc[i,'pred_relevant'] = relevant
    weibo_comments.loc[i,'pred_stance_post'] = post_stance
    # Re-segmenting with thu.cut would be slow; only the stance phrase between
    # the two fixed markers is replaced in the stored segmentation.
    seg_comment = re.sub(r"被 评论者 的 立场 为 ： .* 。 评论 内容 为 ： ",f"被 评论者 的 立场 为 ： {stance_str} 。 评论 内容 为 ： ", comment['seg_content'])
    weibo_comments.loc[i,'seg_content'] = seg_comment

In [None]:
# Keep only comments that have a 'pred_relevant' value.
weibo_comments = weibo_comments[~weibo_comments['pred_relevant'].isna()]

In [None]:
# Number of Weibo comments remaining after the filter.
len(weibo_comments)

In [None]:
# Save the updated Weibo comments.
# NOTE(review): this writes to the working directory, while the input was read from
# 'data/weibo_comments_moral_seg.csv' — confirm which copy later runs should load.
weibo_comments.to_csv('weibo_comments_moral_seg.csv',index=False)

In [None]:
# Load the Zhihu comments; the loop below reads 'answer_id', 'post_content'
# and 'comment_content' from it.
zhihu_comments = pd.read_csv('data/zhihu_comments_moral.csv')

In [None]:
# For every Zhihu comment, look up its parent answer's predicted relevance and
# stance, then build and segment the classifier input text. Comments are
# skipped (and later dropped, because they get no 'seg_content') when no
# parent answer is found or when the stance has no entry in `stance_dict`.
# Missing columns now raise instead of silently skipping every row.
for i, comment in tqdm(zhihu_comments.iterrows(), total=len(zhihu_comments)):
    match = zhihu_posts[zhihu_posts['answer_id']==comment['answer_id']]
    if match.empty:
        continue
    relevant = match['pred_relevant'].values[0]
    post_stance = match['pred_stance'].values[0]
    if post_stance not in stance_dict:
        # Unmapped stance (e.g. NaN): skip before writing partial results.
        continue
    zhihu_comments.loc[i,'pred_relevant'] = relevant
    zhihu_comments.loc[i,'pred_stance_post'] = post_stance
    text = f"被 评论 的 文本 为 ： {comment['post_content']} 。 被 评论者 的 立场 为 ： {stance_dict[post_stance]} 。 评论 内容 为 ： {comment['comment_content']}"
    seg_comment = thu.cut(text, text=True)
    zhihu_comments.loc[i,'seg_content'] = seg_comment

In [None]:
# Keep only comments for which the loop above produced segmented content.
zhihu_comments = zhihu_comments[~zhihu_comments['seg_content'].isna()]

In [None]:
# Predict the stance of every remaining Zhihu comment with the comment-stance model.
zhihu_comments['pred_stance_comment'] = search_stance_cm.predict(zhihu_comments['seg_content'])

In [None]:
# Save the Zhihu comments together with their predictions.
zhihu_comments.to_csv('zhihu_comments_ML.csv',index=False)