# **Task A**

In [1]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

import findspark
findspark.init()

from pyspark import SparkContext
sc = SparkContext(appName="YourTest", master="local[*]")

In [3]:
!wget https://ndownloader.figshare.com/files/16188500 -q
!tar -xvf 16188500
!unzip -qn rumoureval2019/rumoureval-2019-training-data.zip

rumoureval2019/
rumoureval2019/final-eval-key.json
rumoureval2019/LICENSE
rumoureval2019/home_scorer_macro.py
rumoureval2019/README
rumoureval2019/rumoureval-2019-training-data.zip
rumoureval2019/rumoureval-2019-test-data.zip


In [4]:
#### IMPORT ####

from pyspark import SparkContext
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import udf, col, expr, explode, struct, regexp_replace, collect_list, lit
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import * 
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.linalg import VectorUDT, Vectors
from functools import partial
import re


In [5]:
spark = SparkSession.builder.appName("YourTest").getOrCreate()

In [6]:
#### DOWNLOAD SOURCE TWEETS && REPLY TWEETS ###

path = "./rumoureval-2019-training-data/twitter-english/*/*/source-tweet/*.json"
source_tweets_df = spark.read.json(path)
path = "./rumoureval-2019-training-data/twitter-english/*/*/replies/*.json"
reply_tweets_df = spark.read.json(path)

In [7]:
#### DOWNLOAD TRUE LABELS ###

schema = StructType([StructField("subtaskaenglish", MapType(StringType(), StringType())),StructField("subtaskbenglish", MapType(StringType(), StringType()))])
dev_key = spark.read.schema(schema).option("multiline", "true").json('rumoureval-2019-training-data/dev-key.json')
train_key = spark.read.schema(schema).option("multiline", "true").json('rumoureval-2019-training-data/train-key.json')

#### TRUE LABELS FOR TASK A ###

dev_key_taskA = dev_key.select(explode(col("subtaskaenglish")))
train_key_taskA = train_key.select(explode(col("subtaskaenglish")))

In [8]:
### DATA CLEANING  ##

def clean(df):

    def replace_url(text):
        return re.sub(r'https?:\/\/.*[\r\n]*', 'url_url_url', text, flags=re.MULTILINE)

    replace_url_udf = udf(replace_url, StringType())

    df = df.withColumn('cleaned_text', replace_url_udf(col('text')))

    ### REMOVE @
    
    df = df.withColumn('cleaned_text', regexp_replace(col('cleaned_text'), r'(@([A-Za-z0-9]+))', ''))

    return df

In [9]:
words_dict = dict(
      has_belief_words = set("assume believe apparent per-haps suspect think thought consider".split()),
      has_report_words = set("evidence source official footage capture assert told claim according".split()),
      has_doubt_words = set("wonder allege unsure guess speculate doubt".split()),
      has_knowledge = set("confirm definitely admit".split()),
      has_denial_words = set("refuse reject rebuff dismiss contradict oppose".split()),
      has_curse_words = set("lol rofl lmfao yeah stfu aha wtf shit".split()),
      has_question_words = set("when which what who how whom why whose".split()),
      has_other_words = set("irresponsible careless liar false witness untrue neglect integrity murder fake".split())
)

In [10]:
## FEATURE EXTRACTION ##

def extract_features(df):
    hasqmark = udf(lambda x: int('?' in x), IntegerType())
    df = df.withColumn('hasqmark', hasqmark(col('cleaned_text')))

    hasmark = udf(lambda x: int('!' in x), IntegerType())
    df = df.withColumn('hasmark', hasmark(col('cleaned_text')))

    hasperiod = udf(lambda x: int('.' in x), IntegerType())
    df = df.withColumn('hasperiod', hasperiod(col('cleaned_text')))

    df = df.withColumn('hashtags_count', expr('size(entities.hashtags)'))

    df = df.withColumn('mentions_count', expr('size(entities.user_mentions)'))

    df = df.withColumn('hasurls', expr('cast(size(entities.urls) >= 1 AS int)'))

    df = df.withColumn('hasmedia', expr('cast(size(entities.media) >= 1 AS int)'))

    df = df.withColumn('friends_count', expr('user.friends_count'))

    df = df.withColumn('followers_count', expr('user.followers_count'))

    ratiocapital = udf(lambda x: sum(map(str.isupper, x))/(len(x)+1), FloatType())
    df = df.withColumn('ratiocapital', ratiocapital(col('cleaned_text')))

    charlen = udf(lambda x: len(x), IntegerType())
    df = df.withColumn('charlen', charlen(col('cleaned_text')))

    df = df.withColumn('issource', expr('CAST((in_reply_to_status_id IS NULL) AS INT)'))

    ## TOKENIZATION ##

    tokenizer = Tokenizer(inputCol="cleaned_text", outputCol="words")
    temp_df = tokenizer.transform(df)

    #TODO remove stop words?

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df = remover.transform(temp_df)

    hashingTF = HashingTF(inputCol="filtered", outputCol="rawhashtf", numFeatures=100)
    df = hashingTF.transform(df)

    idf = IDF(inputCol="rawhashtf", outputCol="hashtf")
    idfModel = idf.fit(df)
    df = idfModel.transform(df)

    ## TOKEN FEATURE EXTRACTION ##
    wordlen = udf(lambda words: len(words), IntegerType())
    df = df.withColumn('wordlen', wordlen(col('words')))

    def contains(y, x):
      return int(bool(len(y.intersection(set(x)))))

    for name, ys in words_dict.items():
        df = df.withColumn(name, udf(partial(contains, ys), IntegerType())(col('words')))

    #TODO negation words etc.
    negationwords = ['not', 'no', 'nobody', 'nothing', 'none', 'never',
              'neither', 'nor', 'nowhere', 'hardly', 'scarcely',
              'barely', 'don', 'isn', 'wasn', 'shouldn', 'wouldn',
              'couldn', 'doesn']
    def negacount(words):
      c = 0
      for negationword in negationwords:
        if negationword in words:
          c += 1
      return c
    negationcount = udf(negacount, IntegerType())
    df = df.withColumn('hasnegation', negationcount(col('words')))

    @udf('float')
    def count_upper(x):
        a = x.split()
        return sum(map(str.isupper, a))/(len(a) + 1)

    df = df.withColumn('allcapsratio', count_upper(col('cleaned_text')))

    return df

In [11]:
source_preprocessed = extract_features(clean(source_tweets_df))
reply_preprocessed = extract_features(clean(reply_tweets_df))
print(source_preprocessed.count())
print(reply_preprocessed.count())

325
5243


In [12]:
all_features = """hasmark hasqmark hasperiod hashtags_count mentions_count hasurls hasmedia
ratiocapital charlen issource wordlen hasnegation allcapsratio hashtf
favorite_count friends_count followers_count""".split() + list(words_dict.keys())
reply_features = []

In [13]:
train_tweets = reply_preprocessed.select(['id'] + all_features + reply_features).union(
        source_preprocessed.select(['id'] + all_features + reply_features)
    )
train_all = train_key_taskA.withColumnRenamed('key', 'id').withColumnRenamed('value', 'label').join(train_tweets, 'id'
)

dev_tweets =   reply_preprocessed.select(['id'] + all_features + reply_features).union(
       source_preprocessed.select(['id'] + all_features + reply_features) 
)

dev = dev_key_taskA.withColumnRenamed('key', 'id').withColumnRenamed('value', 'label').join( dev_tweets, 'id'
)

In [14]:
print(train_key_taskA.count())
print(train_tweets.count())
print(train_all.count())
print(dev_key_taskA.count())
print(dev_tweets.count())
print(dev.count())

5217
5568
4519
1485
5568
1049


In [15]:
# ML TEST

from sklearn.metrics import classification_report, confusion_matrix
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [16]:
inputCols = """hasmark hasqmark hasperiod hashtags_count mentions_count hasurls
ratiocapital charlen issource wordlen hasnegation allcapsratio""".split() + list(words_dict.keys())

assembler = VectorAssembler(inputCols=inputCols,outputCol="features")
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=False)
indexer = StringIndexer(inputCol="label", outputCol="label_index")
pipeline = Pipeline(stages=[assembler, scaler, indexer])

processor = pipeline.fit(train_all)

temp = processor.transform(train_all)
train_all_features_df = temp.select(['features', 'label_index'])

temp = processor.transform(dev)
dev_features_df = temp.select(['features', 'label_index'])

In [17]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator, TrainValidationSplitModel
lr = LogisticRegression(labelCol='label_index', maxIter=10)
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.01, 0.001]) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
paramGrid = paramGrid.build()

In [18]:
evaluator = MulticlassClassificationEvaluator(metricName="f1", labelCol='label_index')
# tvs = TrainValidationSplit(estimator=lr,
#                            estimatorParamMaps=paramGrid,
#                            evaluator=evaluator,
#                            # 80% of the data will be used for training, 20% for validation.
#                            trainRatio=0.8)

tvs = CrossValidator(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           # 80% of the data will be used for training, 20% for validation.
                           numFolds=3)
# tvs = LogisticRegression(labelCol='label_index', maxIter=10, regParam=0.001)

In [19]:
# train model
# model = trainer.fit(train_all_features_df, )
model = tvs.fit(train_all_features_df)

In [20]:
# # compute f1 on the dev set
# result = model.transform(dev_features_df)
# predictionAndLabels = result.select("prediction", "label_index")

# print("Test set f1 = " + str(evaluator.evaluate(predictionAndLabels)))

In [22]:
result = model.transform(dev_features_df)
y_true = result.select(['label_index']).collect()
y_pred = result.select(['prediction']).collect()
print(classification_report(y_true, y_pred, digits=3))

              precision    recall  f1-score   support

         0.0      0.799     0.955     0.870       778
         1.0      0.683     0.298     0.415        94
         2.0      0.615     0.453     0.522       106
         3.0      0.000     0.000     0.000        71

    accuracy                          0.781      1049
   macro avg      0.524     0.426     0.452      1049
weighted avg      0.716     0.781     0.735      1049



  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
best_params = {param[0].name: param[1] for param in model.bestModel.extractParamMap().items()}
best_lr = LogisticRegression(**best_params)
best_model = best_lr.fit(train_all_features_df)

In [25]:
result = best_model.transform(dev_features_df)
y_true = result.select(['label_index']).collect()
y_pred = result.select(['prediction']).collect()
print(classification_report(y_true, y_pred, digits=3))

              precision    recall  f1-score   support

         0.0      0.799     0.955     0.870       778
         1.0      0.683     0.298     0.415        94
         2.0      0.615     0.453     0.522       106
         3.0      0.000     0.000     0.000        71

    accuracy                          0.781      1049
   macro avg      0.524     0.426     0.452      1049
weighted avg      0.716     0.781     0.735      1049



  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
# model.write().overwrite().save('subtaskA_model')
all_pipeline = Pipeline(stages=[processor, best_model])
!rm -rf subtaskA_model/
all_pipeline.write().overwrite().save('subtaskA_model')

In [27]:
!zip -rq subtaskA_model subtaskA_model/