### Abhishek Kumar <br>
### Primary Dataset

In [1]:
import sklearn
from sklearn.metrics import cohen_kappa_score
import pandas as pd

##### For a given topic dataset CSV, create three separate CSVs, corresponding to lockdowns, masking and distancing, and vaccination. The new CSVs are written with the name of the topic suffixed to the original filename.

```
import argparse
import re
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument("--infile", "-i", type=str, help="CSV for dataset to expand",
                    required=True)
args = parser.parse_args()

for topic in ["lockdowns", "masking and distancing", "vaccination"]:
    df = pd.read_csv(args.infile)
    cols = df.columns
    for col in cols:
        if "annotation" in col:
            df[col] = (df[col].notna() & df[col].str.contains(topic))
    new_fn = re.sub(r"\.csv$", f"_{topic.replace(' ', '_')}.csv", args.infile)
    df.to_csv(new_fn, index=False)

```

###### Using command: python expand_topic_csv_dataset.py --infile twitter_topic_0.csv
###### Using command: python expand_topic_csv_dataset.py --infile twitter_topic_1.csv
###### Using command: python expand_topic_csv_dataset.py --infile twitter_topic_2.csv
###### Using command: python expand_topic_csv_dataset.py --infile twitter_topic_3.csv



## twitter_topic_0_lockdowns.csv

In [2]:
df = pd.read_csv(r'twitter_topic\twitter_topic_0_lockdowns.csv')
df.head()

Unnamed: 0,text,annotation_104,annotation_101,annotation_102,annotation_103
0,Putin After Announcing #CovidVaccine #Russian ...,False,False,False,False
1,Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...,False,False,False,False
2,4 of the vaccines Jared bought are expected to...,False,False,False,False
3,One day you will realize CDC Guidelines magica...,False,False,False,False
4,Im far from lying. Current CDC guidelines is ...,False,False,False,False


In [3]:
print(df['annotation_101'].isnull().sum())
print(df['annotation_102'].isnull().sum())
print(df['annotation_103'].isnull().sum())
print(df['annotation_104'].isnull().sum())
print(df['text'].isnull().sum())

0
0
0
0
0


In [4]:
annotation_1= df['annotation_101']
annotation_2= df['annotation_102']
annotation_3= df['annotation_103']
annotation_4= df['annotation_104']

### Cohen_kappa average score of annotation 101

In [5]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 101:",(cohen_score_average_1))
    return cohen_score_average_1

In [6]:
cohen_avg_1()

0.208

### Cohen_kappa average score of annotation 102

In [7]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 102:",(cohen_score_average_2))
    return cohen_score_average_2

In [8]:
cohen_avg_2()

0.17

### Cohen_kappa average score of annotation 103

In [9]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 103:",(cohen_score_average_3))
    return cohen_score_average_3

In [10]:
cohen_avg_3()

0.21

### Cohen_kappa average score of annotation 104

In [11]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 104:",(cohen_score_average_4))
    return cohen_score_average_4

In [12]:
cohen_avg_4()

0.153

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [13]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_101','annotation_102','annotation_103','annotation_104']
    #columns =[annotation_1,annotation_2,annotation_3,annotation_4]
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df.drop(columns_name[i], axis=1, inplace = True)
    print(df.head())
    r = []
    for col in df.columns:
        r.append(col)
    print(r[1:])
    


In [14]:
drop_annotate()

annotation_102
0.17
annotation_104
0.153
                                                text  annotation_101  \
0  Putin After Announcing #CovidVaccine #Russian ...           False   
1  Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...           False   
2  4 of the vaccines Jared bought are expected to...           False   
3  One day you will realize CDC Guidelines magica...           False   
4  Im far from lying.  Current CDC guidelines is ...           False   

   annotation_103  
0           False  
1           False  
2           False  
3           False  
4           False  
['annotation_101', 'annotation_103']


In [15]:
def updateLabel(df):
    for index, rows in df.iterrows():
        match = [[cohen_avg_1(),rows['annotation_101']],
                       [cohen_avg_3(),rows['annotation_103']]]
        boolean = [rows['annotation_101'],rows['annotation_103']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df.loc[index, 'Final Label'] = True
            else:
                df.loc[index, 'Final Label'] = False


In [16]:
updateLabel(df)
df = df.iloc[:,[0,-1]]
df.head()

Unnamed: 0,text,Final Label
0,Putin After Announcing #CovidVaccine #Russian ...,False
1,Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...,False
2,4 of the vaccines Jared bought are expected to...,False
3,One day you will realize CDC Guidelines magica...,False
4,Im far from lying. Current CDC guidelines is ...,False


In [17]:
df.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_0_lockdowns.csv',
           index = False)

## twitter_topic_0_masking_and_distancing.csv

In [18]:
df_1 = pd.read_csv(r'twitter_topic\twitter_topic_0_masking_and_distancing.csv')
df_1.head()



Unnamed: 0,text,annotation_104,annotation_101,annotation_102,annotation_103
0,Putin After Announcing #CovidVaccine #Russian ...,False,False,False,False
1,Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...,False,False,False,False
2,4 of the vaccines Jared bought are expected to...,False,False,False,False
3,One day you will realize CDC Guidelines magica...,True,False,False,False
4,Im far from lying. Current CDC guidelines is ...,True,True,False,True


In [19]:
print(df_1['annotation_101'].isnull().sum())
print(df_1['annotation_102'].isnull().sum())
print(df_1['annotation_103'].isnull().sum())
print(df_1['annotation_104'].isnull().sum())
print(df_1['text'].isnull().sum())

0
0
0
0
0


In [20]:
annotation_1= df_1['annotation_101']
annotation_2= df_1['annotation_102']
annotation_3= df_1['annotation_103']
annotation_4= df_1['annotation_104']

### Cohen_kappa average score of annotation 101

In [21]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 101:",(cohen_score_average_1))
    return cohen_score_average_1

In [22]:
cohen_avg_1()

0.77

### Cohen_kappa average score of annotation 102

In [23]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 102:",(cohen_score_average_2))
    return cohen_score_average_2

In [24]:
cohen_avg_2()

0.733

### Cohen_kappa average score of annotation 103

In [25]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 103:",(cohen_score_average_3))
    return cohen_score_average_3

In [26]:
cohen_avg_3()

0.763

### Cohen_kappa average score of annotation 104

In [27]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 104:",(cohen_score_average_4))
    return cohen_score_average_4

In [28]:
cohen_avg_4()

0.511

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [29]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_101','annotation_102','annotation_103','annotation_104']
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_1.drop(columns_name[i], axis=1, inplace = True)
    print(df_1.head())
    r = []
    for col in df_1.columns:
        r.append(col)
    print(r[1:])

In [30]:
drop_annotate()

                                                text  annotation_104  \
0  Putin After Announcing #CovidVaccine #Russian ...           False   
1  Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...           False   
2  4 of the vaccines Jared bought are expected to...           False   
3  One day you will realize CDC Guidelines magica...            True   
4  Im far from lying.  Current CDC guidelines is ...            True   

   annotation_101  annotation_102  annotation_103  
0           False           False           False  
1           False           False           False  
2           False           False           False  
3           False           False           False  
4            True           False            True  
['annotation_104', 'annotation_101', 'annotation_102', 'annotation_103']


In [31]:
def updateLabel(df_1):
    for index, rows in df_1.iterrows():
        match = [[cohen_avg_1(),rows['annotation_101']],
                   [cohen_avg_2(),rows['annotation_102']],
                   [cohen_avg_3(),rows['annotation_103']],
                    [cohen_avg_4(),rows['annotation_104']]]
        boolean = [rows['annotation_101'],rows['annotation_102'],rows['annotation_103'],rows['annotation_104']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_1.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df_1.loc[index, 'Final Label'] = True
            else:
                df_1.loc[index, 'Final Label'] = False
    



In [32]:
updateLabel(df_1)
df_1 = df_1.iloc[:,[0,-1]]
df_1.head()

Unnamed: 0,text,Final Label
0,Putin After Announcing #CovidVaccine #Russian ...,False
1,Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...,False
2,4 of the vaccines Jared bought are expected to...,False
3,One day you will realize CDC Guidelines magica...,False
4,Im far from lying. Current CDC guidelines is ...,True


In [33]:
df_1.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_0_masking_and_distancing.csv',
           index = False)

## twitter_topic_0_vaccination.csv

In [34]:
df_2 = pd.read_csv(r'twitter_topic\twitter_topic_0_vaccination.csv')
df_2.head()

Unnamed: 0,text,annotation_104,annotation_101,annotation_102,annotation_103
0,Putin After Announcing #CovidVaccine #Russian ...,True,True,True,False
1,Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...,True,True,False,False
2,4 of the vaccines Jared bought are expected to...,True,True,True,True
3,One day you will realize CDC Guidelines magica...,False,False,False,False
4,Im far from lying. Current CDC guidelines is ...,False,True,True,True


In [35]:
print(df_2['annotation_101'].isnull().sum())
print(df_2['annotation_102'].isnull().sum())
print(df_2['annotation_103'].isnull().sum())
print(df_2['annotation_104'].isnull().sum())
print(df_2['text'].isnull().sum())

0
0
0
0
0


In [36]:
annotation_1= df_2['annotation_101']
annotation_2= df_2['annotation_102']
annotation_3= df_2['annotation_103']
annotation_4= df_2['annotation_104']

### Cohen_kappa average score of annotation 101

In [37]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 101:",(cohen_score_average_1))
    return cohen_score_average_1

In [38]:
cohen_avg_1()

0.762

### Cohen_kappa average score of annotation 102

In [39]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 102:",(cohen_score_average_2))
    return cohen_score_average_2

In [40]:
cohen_avg_2()

0.624

### Cohen_kappa average score of annotation 103

In [41]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 103:",(cohen_score_average_3))
    return cohen_score_average_3

In [42]:
cohen_avg_3()

0.71

### Cohen_kappa average score of annotation 104

In [43]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 104:",(cohen_score_average_4))
    return cohen_score_average_4

In [44]:
cohen_avg_4()

0.673

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [45]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_101','annotation_102','annotation_103','annotation_104']
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_2.drop(columns_name[i], axis=1, inplace = True)
    print(df_2.head())
    r = []
    for col in df_2.columns:
        r.append(col)
    print(r[1:])

In [46]:
drop_annotate()

                                                text  annotation_104  \
0  Putin After Announcing #CovidVaccine #Russian ...            True   
1  Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...            True   
2  4 of the vaccines Jared bought are expected to...            True   
3  One day you will realize CDC Guidelines magica...           False   
4  Im far from lying.  Current CDC guidelines is ...           False   

   annotation_101  annotation_102  annotation_103  
0            True            True           False  
1            True           False           False  
2            True            True            True  
3           False           False           False  
4            True            True            True  
['annotation_104', 'annotation_101', 'annotation_102', 'annotation_103']


In [47]:
def updateLabel(df_2):
    for index, rows in df_2.iterrows():
        match = [[cohen_avg_1(),rows['annotation_101']],
                   [cohen_avg_2(),rows['annotation_102']],
                   [cohen_avg_3(),rows['annotation_103']],
                    [cohen_avg_4(),rows['annotation_104']]]
        boolean = [rows['annotation_101'],rows['annotation_102'],rows['annotation_103'],rows['annotation_104']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_2.loc[index, 'Final Label'] = colValue
        elif true!=false:
            if true > false:
                df_1.loc[index, 'Final Label'] = True
            else:
                df_1.loc[index, 'Final Label'] = False
    



In [48]:
updateLabel(df_2)
df_2 = df_2.iloc[:,[0,-1]]
df_2.head()

Unnamed: 0,text,Final Label
0,Putin After Announcing #CovidVaccine #Russian ...,True
1,Courtesy: WA! #WhatsApp #COVID #CovidVaccine h...,True
2,4 of the vaccines Jared bought are expected to...,True
3,One day you will realize CDC Guidelines magica...,False
4,Im far from lying. Current CDC guidelines is ...,True


In [49]:
df_2.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_0_vaccination.csv',
           index = False)

## twitter_topic_1_lockdowns.csv

In [50]:
df_3 = pd.read_csv(r'twitter_topic\twitter_topic_1_lockdowns.csv')
df_3.head()

Unnamed: 0,text,annotation_51,annotation_52,annotation_53,annotation_54
0,Follow the CDC guidelines. Don’t become a stat...,False,False,False,False
1,Do you agree with CDC guidelines that children...,False,False,False,False
2,"So, both #Pharmaceutical companies #lilly and ...",False,False,False,False
3,The CDC's guidelines are clear; you just don't...,False,False,False,False
4,CDC Updates School Guidelines For Students Ret...,True,True,True,False


In [51]:
print(df_3['annotation_51'].isnull().sum())
print(df_3['annotation_52'].isnull().sum())
print(df_3['annotation_53'].isnull().sum())
print(df_3['annotation_54'].isnull().sum())
print(df_3['text'].isnull().sum())

0
0
0
0
0


In [52]:
annotation_1= df_3['annotation_51']
annotation_2= df_3['annotation_52']
annotation_3= df_3['annotation_53']
annotation_4= df_3['annotation_54']

### Cohen_kappa average score of annotation 51

In [53]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 51:",(cohen_score_average_1))
    return cohen_score_average_1

In [54]:
cohen_avg_1()

0.384

### Cohen_kappa average score of annotation 52

In [55]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 52:",(cohen_score_average_2))
    return cohen_score_average_2

In [56]:
cohen_avg_2()

0.248

### Cohen_kappa average score of annotation 53

In [57]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 53:",(cohen_score_average_3))
    return cohen_score_average_3

In [58]:
cohen_avg_3()

0.347

### Cohen_kappa average score of annotation 54

In [59]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 54:",(cohen_score_average_4))
    return cohen_score_average_4

In [60]:
cohen_avg_4()

0.362

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [61]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_51','annotation_52','annotation_53','annotation_54']
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_3.drop(columns_name[i], axis=1, inplace = True)
    print(df_3.head())
    r = []
    for col in df_3.columns:
        r.append(col)
    print(r[1:])

In [62]:
drop_annotate()

                                                text  annotation_51  \
0  Follow the CDC guidelines. Don’t become a stat...          False   
1  Do you agree with CDC guidelines that children...          False   
2  So, both #Pharmaceutical companies #lilly and ...          False   
3  The CDC's guidelines are clear; you just don't...          False   
4  CDC Updates School Guidelines For Students Ret...           True   

   annotation_52  annotation_53  annotation_54  
0          False          False          False  
1          False          False          False  
2          False          False          False  
3          False          False          False  
4           True           True          False  
['annotation_51', 'annotation_52', 'annotation_53', 'annotation_54']


In [63]:
def updateLabel(df_3):
    for index, rows in df_3.iterrows():
        match = [[cohen_avg_1(),rows['annotation_51']],
                   [cohen_avg_2(),rows['annotation_52']],
                   [cohen_avg_3(),rows['annotation_53']],
                    [cohen_avg_4(),rows['annotation_54']]]
        boolean = [rows['annotation_51'],rows['annotation_52'],rows['annotation_53'],rows['annotation_54']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_3.loc[index, 'Final Label'] = colValue
            
        elif true != false:
            if true > false:
                df_3.loc[index, 'Final Label'] = True
            else:
                df_3.loc[index, 'Final Label'] = False
    



In [64]:
updateLabel(df_3)
df_3 = df_3.iloc[:,[0,-1]]
df_3.head()

Unnamed: 0,text,Final Label
0,Follow the CDC guidelines. Don’t become a stat...,False
1,Do you agree with CDC guidelines that children...,False
2,"So, both #Pharmaceutical companies #lilly and ...",False
3,The CDC's guidelines are clear; you just don't...,False
4,CDC Updates School Guidelines For Students Ret...,True


In [65]:
df_3.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_1_lockdowns.csv',
           index = False)

## twitter_topic_1_masking_and_distancing.csv

In [66]:
df_4 = pd.read_csv(r'twitter_topic\twitter_topic_1_masking_and_distancing.csv')
df_4.head()

Unnamed: 0,text,annotation_51,annotation_52,annotation_53,annotation_54
0,Follow the CDC guidelines. Don’t become a stat...,True,False,True,False
1,Do you agree with CDC guidelines that children...,True,True,True,True
2,"So, both #Pharmaceutical companies #lilly and ...",False,False,False,False
3,The CDC's guidelines are clear; you just don't...,True,False,False,False
4,CDC Updates School Guidelines For Students Ret...,True,False,False,False


In [67]:
print(df_4['annotation_51'].isnull().sum())
print(df_4['annotation_52'].isnull().sum())
print(df_4['annotation_53'].isnull().sum())
print(df_4['annotation_54'].isnull().sum())
print(df_4['text'].isnull().sum())

0
0
0
0
0


In [68]:
annotation_1= df_4['annotation_51']
annotation_2= df_4['annotation_52']
annotation_3= df_4['annotation_53']
annotation_4= df_4['annotation_54']

### Cohen_kappa average score of annotation 51

In [69]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 51:",(cohen_score_average_1))
    return cohen_score_average_1

In [70]:
cohen_avg_1()

0.555

### Cohen_kappa average score of annotation 52

In [71]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 52:",(cohen_score_average_2))
    return cohen_score_average_2

In [72]:
cohen_avg_2()

0.699

### Cohen_kappa average score of annotation 53

In [73]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 53:",(cohen_score_average_3))
    return cohen_score_average_3

In [74]:
cohen_avg_3()

0.681

### Cohen_kappa average score of annotation 54

In [75]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 54:",(cohen_score_average_4))
    return cohen_score_average_4

In [76]:
cohen_avg_4()

0.705

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [77]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_51','annotation_52','annotation_53','annotation_54']
    #columns =[annotation_1,annotation_2,annotation_3,annotation_4]
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_4.drop(columns_name[i], axis=1, inplace = True)
    print(df_4.head())
    r = []
    for col in df_4.columns:
        r.append(col)
    print(r[1:])

In [78]:
drop_annotate()

                                                text  annotation_51  \
0  Follow the CDC guidelines. Don’t become a stat...           True   
1  Do you agree with CDC guidelines that children...           True   
2  So, both #Pharmaceutical companies #lilly and ...          False   
3  The CDC's guidelines are clear; you just don't...           True   
4  CDC Updates School Guidelines For Students Ret...           True   

   annotation_52  annotation_53  annotation_54  
0          False           True          False  
1           True           True           True  
2          False          False          False  
3          False          False          False  
4          False          False          False  
['annotation_51', 'annotation_52', 'annotation_53', 'annotation_54']


In [79]:
def updateLabel(df_4):
    for index, rows in df_4.iterrows():
        match = [[cohen_avg_1(),rows['annotation_51']],
                   [cohen_avg_2(),rows['annotation_52']],
                   [cohen_avg_3(),rows['annotation_53']],
                    [cohen_avg_4(),rows['annotation_54']]]
        boolean = [rows['annotation_51'],rows['annotation_52'],rows['annotation_53'],rows['annotation_54']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_4.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df_4.loc[index, 'Final Label'] = True
            else:
                df_4.loc[index, 'Final Label'] = False
    



In [80]:
updateLabel(df_4)
df_4 = df_4.iloc[:,[0,-1]]
df_4.head()

Unnamed: 0,text,Final Label
0,Follow the CDC guidelines. Don’t become a stat...,False
1,Do you agree with CDC guidelines that children...,True
2,"So, both #Pharmaceutical companies #lilly and ...",False
3,The CDC's guidelines are clear; you just don't...,False
4,CDC Updates School Guidelines For Students Ret...,False


In [81]:
df_4.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_1_masking_and_distancing.csv',
           index = False)

## twitter_topic_1_vaccination.csv

In [82]:
df_5 = pd.read_csv(r'twitter_topic\twitter_topic_1_vaccination.csv')
df_5.head()

Unnamed: 0,text,annotation_51,annotation_52,annotation_53,annotation_54
0,Follow the CDC guidelines. Don’t become a stat...,False,False,False,False
1,Do you agree with CDC guidelines that children...,False,False,False,False
2,"So, both #Pharmaceutical companies #lilly and ...",True,True,True,True
3,The CDC's guidelines are clear; you just don't...,False,False,False,False
4,CDC Updates School Guidelines For Students Ret...,False,False,False,False


In [83]:
print(df_5['annotation_51'].isnull().sum())
print(df_5['annotation_52'].isnull().sum())
print(df_5['annotation_53'].isnull().sum())
print(df_5['annotation_54'].isnull().sum())
print(df_5['text'].isnull().sum())

0
0
0
0
0


In [84]:
annotation_1= df_5['annotation_51']
annotation_2= df_5['annotation_52']
annotation_3= df_5['annotation_53']
annotation_4= df_5['annotation_54']

### Cohen_kappa average score of annotation 51

In [85]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 51:",(cohen_score_average_1))
    return cohen_score_average_1

In [86]:
cohen_avg_1()

0.656

### Cohen_kappa average score of annotation 52

In [87]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 52:",(cohen_score_average_2))
    return cohen_score_average_2

In [88]:
cohen_avg_2()

0.363

### Cohen_kappa average score of annotation 53

In [89]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 53:",(cohen_score_average_3))
    return cohen_score_average_3

In [90]:
cohen_avg_3()

0.613

### Cohen_kappa average score of annotation 54

In [91]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 54:",(cohen_score_average_4))
    return cohen_score_average_4

In [92]:
cohen_avg_4()

0.608

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [93]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_51','annotation_52','annotation_53','annotation_54']
    #columns =[annotation_1,annotation_2,annotation_3,annotation_4]
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_5.drop(columns_name[i], axis=1, inplace = True)
    print(df_5.head())
    r = []
    for col in df_5.columns:
        r.append(col)
    print(r[1:])

In [94]:
drop_annotate()

                                                text  annotation_51  \
0  Follow the CDC guidelines. Don’t become a stat...          False   
1  Do you agree with CDC guidelines that children...          False   
2  So, both #Pharmaceutical companies #lilly and ...           True   
3  The CDC's guidelines are clear; you just don't...          False   
4  CDC Updates School Guidelines For Students Ret...          False   

   annotation_52  annotation_53  annotation_54  
0          False          False          False  
1          False          False          False  
2           True           True           True  
3          False          False          False  
4          False          False          False  
['annotation_51', 'annotation_52', 'annotation_53', 'annotation_54']


In [95]:
def updateLabel(df_5):
    for index, rows in df_5.iterrows():
        match = [[cohen_avg_1(),rows['annotation_51']],
                   [cohen_avg_2(),rows['annotation_52']],
                   [cohen_avg_3(),rows['annotation_53']],
                    [cohen_avg_4(),rows['annotation_54']]]
        boolean = [rows['annotation_51'],rows['annotation_52'],rows['annotation_53'],rows['annotation_54']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_5.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df_5.loc[index, 'Final Label'] = True
            else:
                df_5.loc[index, 'Final Label'] = False
    

In [96]:
updateLabel(df_5)
df_5 = df_5.iloc[:,[0,-1]]
df_5.head()

Unnamed: 0,text,Final Label
0,Follow the CDC guidelines. Don’t become a stat...,False
1,Do you agree with CDC guidelines that children...,False
2,"So, both #Pharmaceutical companies #lilly and ...",True
3,The CDC's guidelines are clear; you just don't...,False
4,CDC Updates School Guidelines For Students Ret...,False


In [97]:
df_5.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_1_vaccination.csv',
           index = False)

## twitter_topic_2_lockdowns.csv

In [98]:
df_6 = pd.read_csv(r'twitter_topic\twitter_topic_2_lockdowns.csv')
df_6.head()

Unnamed: 0,text,annotation_56,annotation_57,annotation_58,annotation_59
0,"Federal Judge Rules Against CDC, Throws Out Cr...",False,False,False,False
1,Indeed. Even in the dysfunctional US health sy...,False,False,False,False
2,a vaccine seems to be ESSENTIAL if we are to s...,False,False,False,False
3,Nurses union calls on CDC to reinstate univers...,False,False,False,False
4,Sirf #MukeshAmbani Company &amp; it's Worker...,False,False,True,False


In [99]:
print(df_6['annotation_56'].isnull().sum())
print(df_6['annotation_57'].isnull().sum())
print(df_6['annotation_58'].isnull().sum())
print(df_6['annotation_59'].isnull().sum())
print(df_6['text'].isnull().sum())

0
0
0
0
0


In [100]:
annotation_1= df_6['annotation_56']
annotation_2= df_6['annotation_57']
annotation_3= df_6['annotation_58']
annotation_4= df_6['annotation_59']

### Cohen_kappa average score of annotation 56

In [101]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 56:",(cohen_score_average_1))
    return cohen_score_average_1

In [102]:
cohen_avg_1()

0.116

### Cohen_kappa average score of annotation 57

In [103]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 57:",(cohen_score_average_2))
    return cohen_score_average_2

In [104]:
cohen_avg_2()

0.301

### Cohen_kappa average score of annotation 58

In [105]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 58:",(cohen_score_average_3))
    return cohen_score_average_3

In [106]:
cohen_avg_3()

0.195

### Cohen_kappa average score of annotation 59

In [107]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 59:",(cohen_score_average_4))
    return cohen_score_average_4

In [108]:
cohen_avg_4()

0.303

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [109]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_56','annotation_57','annotation_58','annotation_59']
    #columns =[annotation_1,annotation_2,annotation_3,annotation_4]
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_6.drop(columns_name[i], axis=1, inplace = True)
    print(df_6.head())
    r = []
    for col in df_6.columns:
        r.append(col)
    print(r[1:])

In [110]:
drop_annotate()

annotation_56
0.116
annotation_58
0.195
                                                text  annotation_57  \
0  Federal Judge Rules Against CDC, Throws Out Cr...          False   
1  Indeed. Even in the dysfunctional US health sy...          False   
2  a vaccine seems to be ESSENTIAL if we are to s...          False   
3  Nurses union calls on CDC to reinstate univers...          False   
4  Sirf  #MukeshAmbani  Company &amp; it's Worker...          False   

   annotation_59  
0          False  
1          False  
2          False  
3          False  
4          False  
['annotation_57', 'annotation_59']


In [111]:
def updateLabel(df_6):
    for index, rows in df_6.iterrows():
        match = [[cohen_avg_2(),rows['annotation_57']],
                    [cohen_avg_4(),rows['annotation_59']]]
        boolean = [rows['annotation_57'],rows['annotation_59']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_6.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df_6.loc[index, 'Final Label'] = True
            else:
                df_6.loc[index, 'Final Label'] = False
    

In [112]:
updateLabel(df_6)
df_6 = df_6.iloc[:,[0,-1]]
df_6.head()

Unnamed: 0,text,Final Label
0,"Federal Judge Rules Against CDC, Throws Out Cr...",False
1,Indeed. Even in the dysfunctional US health sy...,False
2,a vaccine seems to be ESSENTIAL if we are to s...,False
3,Nurses union calls on CDC to reinstate univers...,False
4,Sirf #MukeshAmbani Company &amp; it's Worker...,False


In [113]:
df_6.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_2_lockdowns.csv',
           index = False)

## twitter_topic_2_masking_and_distancing.csv

In [114]:
df_7 = pd.read_csv(r'twitter_topic\twitter_topic_2_masking_and_distancing.csv')
df_7.head()

Unnamed: 0,text,annotation_56,annotation_57,annotation_58,annotation_59
0,"Federal Judge Rules Against CDC, Throws Out Cr...",False,False,True,True
1,Indeed. Even in the dysfunctional US health sy...,False,False,False,False
2,a vaccine seems to be ESSENTIAL if we are to s...,False,True,False,False
3,Nurses union calls on CDC to reinstate univers...,True,True,True,True
4,Sirf #MukeshAmbani Company &amp; it's Worker...,False,False,False,False


In [115]:
print(df_7['annotation_56'].isnull().sum())
print(df_7['annotation_57'].isnull().sum())
print(df_7['annotation_58'].isnull().sum())
print(df_7['annotation_59'].isnull().sum())
print(df_7['text'].isnull().sum())

0
0
0
0
0


In [116]:
annotation_1= df_7['annotation_56']
annotation_2= df_7['annotation_57']
annotation_3= df_7['annotation_58']
annotation_4= df_7['annotation_59']

### Cohen_kappa average score of annotation 56

In [117]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 56:",(cohen_score_average_1))
    return cohen_score_average_1

In [118]:
cohen_avg_1()

0.635

### Cohen_kappa average score of annotation 57

In [119]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 57:",(cohen_score_average_2))
    return cohen_score_average_2

In [120]:
cohen_avg_2()

0.603

### Cohen_kappa average score of annotation 58

In [121]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 58:",(cohen_score_average_3))
    return cohen_score_average_3

In [122]:
cohen_avg_3()

0.564

### Cohen_kappa average score of annotation 59

In [123]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 59:",(cohen_score_average_4))
    return cohen_score_average_4

In [124]:
cohen_avg_4()

0.663

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [125]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_56','annotation_57','annotation_58','annotation_59']
    #columns =[annotation_1,annotation_2,annotation_3,annotation_4]
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_7.drop(columns_name[i], axis=1, inplace = True)
    print(df_7.head())
    r = []
    for col in df_7.columns:
        r.append(col)
    print(r[1:])

In [126]:
drop_annotate()

                                                text  annotation_56  \
0  Federal Judge Rules Against CDC, Throws Out Cr...          False   
1  Indeed. Even in the dysfunctional US health sy...          False   
2  a vaccine seems to be ESSENTIAL if we are to s...          False   
3  Nurses union calls on CDC to reinstate univers...           True   
4  Sirf  #MukeshAmbani  Company &amp; it's Worker...          False   

   annotation_57  annotation_58  annotation_59  
0          False           True           True  
1          False          False          False  
2           True          False          False  
3           True           True           True  
4          False          False          False  
['annotation_56', 'annotation_57', 'annotation_58', 'annotation_59']


In [127]:
def updateLabel(df_7):
    for index, rows in df_7.iterrows():
        match = [[cohen_avg_1(),rows['annotation_56']],
                   [cohen_avg_2(),rows['annotation_57']],
                   [cohen_avg_3(),rows['annotation_58']],
                    [cohen_avg_4(),rows['annotation_59']]]
        boolean = [rows['annotation_56'],rows['annotation_57'],rows['annotation_58'],rows['annotation_59']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_7.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df_7.loc[index, 'Final Label'] = True
            else:
                df_7.loc[index, 'Final Label'] = False
    

In [128]:
updateLabel(df_7)
df_7 = df_7.iloc[:,[0,-1]]
df_7.head()

Unnamed: 0,text,Final Label
0,"Federal Judge Rules Against CDC, Throws Out Cr...",True
1,Indeed. Even in the dysfunctional US health sy...,False
2,a vaccine seems to be ESSENTIAL if we are to s...,False
3,Nurses union calls on CDC to reinstate univers...,True
4,Sirf #MukeshAmbani Company &amp; it's Worker...,False


In [129]:
df_7.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_2_masking_and_distancing.csv',
           index = False)

## twitter_topic_2_vaccination.csv

In [130]:
df_8 = pd.read_csv(r'twitter_topic\twitter_topic_2_vaccination.csv')
df_8.head()

Unnamed: 0,text,annotation_56,annotation_57,annotation_58,annotation_59
0,"Federal Judge Rules Against CDC, Throws Out Cr...",False,False,False,False
1,Indeed. Even in the dysfunctional US health sy...,True,True,True,True
2,a vaccine seems to be ESSENTIAL if we are to s...,True,False,True,True
3,Nurses union calls on CDC to reinstate univers...,False,False,False,False
4,Sirf #MukeshAmbani Company &amp; it's Worker...,False,False,False,False


In [131]:
print(df_8['annotation_56'].isnull().sum())
print(df_8['annotation_57'].isnull().sum())
print(df_8['annotation_58'].isnull().sum())
print(df_8['annotation_59'].isnull().sum())
print(df_8['text'].isnull().sum())

0
0
0
0
0


In [132]:
annotation_1= df_8['annotation_56']
annotation_2= df_8['annotation_57']
annotation_3= df_8['annotation_58']
annotation_4= df_8['annotation_59']

### Cohen_kappa average score of annotation 56

In [133]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 56:",(cohen_score_average_1))
    return cohen_score_average_1

In [134]:
cohen_avg_1()

0.69

### Cohen_kappa average score of annotation 57

In [135]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 57:",(cohen_score_average_2))
    return cohen_score_average_2

In [136]:
cohen_avg_2()

0.695

### Cohen_kappa average score of annotation 58

In [137]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 58:",(cohen_score_average_3))
    return cohen_score_average_3

In [138]:
cohen_avg_3()

0.585

### Cohen_kappa average score of annotation 59

In [139]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 59:",(cohen_score_average_4))
    return cohen_score_average_4

In [140]:
cohen_avg_4()

0.737

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [141]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_56','annotation_57','annotation_58','annotation_59']
    #columns =[annotation_1,annotation_2,annotation_3,annotation_4]
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_8.drop(columns_name[i], axis=1, inplace = True)
    print(df_8.head())
    r = []
    for col in df_8.columns:
        r.append(col)
    print(r[1:])

In [142]:
drop_annotate()

                                                text  annotation_56  \
0  Federal Judge Rules Against CDC, Throws Out Cr...          False   
1  Indeed. Even in the dysfunctional US health sy...           True   
2  a vaccine seems to be ESSENTIAL if we are to s...           True   
3  Nurses union calls on CDC to reinstate univers...          False   
4  Sirf  #MukeshAmbani  Company &amp; it's Worker...          False   

   annotation_57  annotation_58  annotation_59  
0          False          False          False  
1           True           True           True  
2          False           True           True  
3          False          False          False  
4          False          False          False  
['annotation_56', 'annotation_57', 'annotation_58', 'annotation_59']


In [143]:
def updateLabel(df_8):
    for index, rows in df_8.iterrows():
        match = [[cohen_avg_1(),rows['annotation_56']],
                   [cohen_avg_2(),rows['annotation_57']],
                   [cohen_avg_3(),rows['annotation_58']],
                    [cohen_avg_4(),rows['annotation_59']]]
        boolean = [rows['annotation_56'],rows['annotation_57'],rows['annotation_58'],rows['annotation_59']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_8.loc[index, 'Final Label'] = colValue
            
        elif true !=false:
            if true > false:
                df_8.loc[index, 'Final Label'] = True
            else:
                df_8.loc[index, 'Final Label'] = False
    

In [144]:
updateLabel(df_8)
df_8 = df_8.iloc[:,[0,-1]]
df_8.head()

Unnamed: 0,text,Final Label
0,"Federal Judge Rules Against CDC, Throws Out Cr...",False
1,Indeed. Even in the dysfunctional US health sy...,True
2,a vaccine seems to be ESSENTIAL if we are to s...,True
3,Nurses union calls on CDC to reinstate univers...,False
4,Sirf #MukeshAmbani Company &amp; it's Worker...,False


In [145]:
df_8.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_2_vaccination.csv',
           index = False)

## twitter_topic_3_lockdowns.csv

In [146]:
df_9 = pd.read_csv(r'twitter_topic\twitter_topic_3_lockdowns.csv')
df_9.head()

Unnamed: 0,text,annotation_91,annotation_85,annotation_86,annotation_87
0,Bad news. Johnson &amp; Johnson pauses #CovidV...,False,False,False,False
1,Saw this on TV and you women Need to work with...,False,False,False,True
2,Two Indian vaccine candidates against COVID-19...,False,False,False,False
3,The point is media &amp; govt lie abt the numb...,False,False,False,False
4,California students will continue wearing mask...,False,True,False,False


In [147]:
print(df_9['annotation_85'].isnull().sum())
print(df_9['annotation_86'].isnull().sum())
print(df_9['annotation_87'].isnull().sum())
print(df_9['annotation_91'].isnull().sum())
print(df_9['text'].isnull().sum())

0
0
0
0
0


In [148]:
annotation_1= df_9['annotation_85']
annotation_2= df_9['annotation_86']
annotation_3= df_9['annotation_87']
annotation_4= df_9['annotation_91']

### Cohen_kappa average score of annotation 85

In [149]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 85:",(cohen_score_average_1))
    return cohen_score_average_1

In [150]:
cohen_avg_1()

0.218

### Cohen_kappa average score of annotation 86

In [151]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 86:",(cohen_score_average_2))
    return cohen_score_average_2

In [152]:
cohen_avg_2()

0.192

### Cohen_kappa average score of annotation 87

In [153]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 87:",(cohen_score_average_3))
    return cohen_score_average_3

In [154]:
cohen_avg_3()

0.261

### Cohen_kappa average score of annotation 91

In [155]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 91:",(cohen_score_average_4))
    return cohen_score_average_4

In [156]:
cohen_avg_4()

0.328

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [157]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_85','annotation_86','annotation_87','annotation_91']
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_9.drop(columns_name[i], axis=1, inplace = True)
    print(df_9.head())
    r = []
    for col in df_9.columns:
        r.append(col)
    print(r[1:])

In [158]:
drop_annotate()

annotation_86
0.192
                                                text  annotation_91  \
0  Bad news. Johnson &amp; Johnson pauses #CovidV...          False   
1  Saw this on TV and you women Need to work with...          False   
2  Two Indian vaccine candidates against COVID-19...          False   
3  The point is media &amp; govt lie abt the numb...          False   
4  California students will continue wearing mask...          False   

   annotation_85  annotation_87  
0          False          False  
1          False           True  
2          False          False  
3          False          False  
4           True          False  
['annotation_91', 'annotation_85', 'annotation_87']


In [159]:
def updateLabel(df_9):
    for index, rows in df_9.iterrows():
        match = [[cohen_avg_1(),rows['annotation_85']],
                   [cohen_avg_3(),rows['annotation_87']],
                   [cohen_avg_4(),rows['annotation_91']]]
        boolean = [rows['annotation_85'],rows['annotation_87'],rows['annotation_91']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_9.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df_9.loc[index, 'Final Label'] = True
            else:
                df_9.loc[index, 'Final Label'] = False
    

In [160]:
updateLabel(df_9)
df_9 = df_9.iloc[:,[0,-1]]
df_9.head()

Unnamed: 0,text,Final Label
0,Bad news. Johnson &amp; Johnson pauses #CovidV...,False
1,Saw this on TV and you women Need to work with...,False
2,Two Indian vaccine candidates against COVID-19...,False
3,The point is media &amp; govt lie abt the numb...,False
4,California students will continue wearing mask...,False


In [161]:
df_9.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_3_lockdowns.csv',
           index = False)

## twitter_topic_3_masking_and_distancing.csv

In [162]:
df_10 = pd.read_csv(r'twitter_topic\twitter_topic_3_masking_and_distancing.csv')
df_10.head()

Unnamed: 0,text,annotation_91,annotation_85,annotation_86,annotation_87
0,Bad news. Johnson &amp; Johnson pauses #CovidV...,False,False,False,False
1,Saw this on TV and you women Need to work with...,True,True,True,True
2,Two Indian vaccine candidates against COVID-19...,False,False,False,False
3,The point is media &amp; govt lie abt the numb...,False,False,False,False
4,California students will continue wearing mask...,True,True,True,True


In [163]:
print(df_10['annotation_85'].isnull().sum())
print(df_10['annotation_86'].isnull().sum())
print(df_10['annotation_87'].isnull().sum())
print(df_10['annotation_91'].isnull().sum())
print(df_10['text'].isnull().sum())

0
0
0
0
0


In [164]:
annotation_1= df_10['annotation_85']
annotation_2= df_10['annotation_86']
annotation_3= df_10['annotation_87']
annotation_4= df_10['annotation_91']

### Cohen_kappa average score of annotation 85

In [165]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 85:",(cohen_score_average_1))
    return cohen_score_average_1

In [166]:
cohen_avg_1()

0.803

### Cohen_kappa average score of annotation 86

In [167]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 86:",(cohen_score_average_2))
    return cohen_score_average_2

In [168]:
cohen_avg_2()

0.856

### Cohen_kappa average score of annotation 87

In [169]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 87:",(cohen_score_average_3))
    return cohen_score_average_3

In [170]:
cohen_avg_3()

0.814

### Cohen_kappa average score of annotation 91

In [171]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 91:",(cohen_score_average_4))
    return cohen_score_average_4

In [172]:
cohen_avg_4()

0.832

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [173]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_85','annotation_86','annotation_87','annotation_91']
    #columns =[annotation_1,annotation_2,annotation_3,annotation_4]
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_10.drop(columns_name[i], axis=1, inplace = True)
    print(df_10.head())
    r = []
    for col in df_10.columns:
        r.append(col)
    print(r[1:])

In [174]:
drop_annotate()

                                                text  annotation_91  \
0  Bad news. Johnson &amp; Johnson pauses #CovidV...          False   
1  Saw this on TV and you women Need to work with...           True   
2  Two Indian vaccine candidates against COVID-19...          False   
3  The point is media &amp; govt lie abt the numb...          False   
4  California students will continue wearing mask...           True   

   annotation_85  annotation_86  annotation_87  
0          False          False          False  
1           True           True           True  
2          False          False          False  
3          False          False          False  
4           True           True           True  
['annotation_91', 'annotation_85', 'annotation_86', 'annotation_87']


In [175]:
def updateLabel(df_10):
    for index, rows in df_10.iterrows():
        match = [[cohen_avg_1(),rows['annotation_85']],
                   [cohen_avg_2(),rows['annotation_86']],
                   [cohen_avg_3(),rows['annotation_87']],
                   [cohen_avg_4(),rows['annotation_91']]]
        boolean = [rows['annotation_85'],rows['annotation_86'],rows['annotation_87'],rows['annotation_91']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_10.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df_10.loc[index, 'Final Label'] = True
            else:
                df_10.loc[index, 'Final Label'] = False
    

In [176]:
updateLabel(df_10)
df_10 = df_10.iloc[:,[0,-1]]
df_10.head()

Unnamed: 0,text,Final Label
0,Bad news. Johnson &amp; Johnson pauses #CovidV...,False
1,Saw this on TV and you women Need to work with...,True
2,Two Indian vaccine candidates against COVID-19...,False
3,The point is media &amp; govt lie abt the numb...,False
4,California students will continue wearing mask...,True


In [177]:
df_10.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_3_masking_and_distancing.csv',
           index = False)

## twitter_topic_3_vaccination.csv



In [178]:
df_11 = pd.read_csv(r'twitter_topic\twitter_topic_3_vaccination.csv')
df_11.head()

Unnamed: 0,text,annotation_91,annotation_85,annotation_86,annotation_87
0,Bad news. Johnson &amp; Johnson pauses #CovidV...,True,True,True,True
1,Saw this on TV and you women Need to work with...,False,False,False,False
2,Two Indian vaccine candidates against COVID-19...,True,True,True,True
3,The point is media &amp; govt lie abt the numb...,True,True,True,True
4,California students will continue wearing mask...,False,False,False,False


In [179]:
print(df_11['annotation_85'].isnull().sum())
print(df_11['annotation_86'].isnull().sum())
print(df_11['annotation_87'].isnull().sum())
print(df_11['annotation_91'].isnull().sum())
print(df_11['text'].isnull().sum())

0
0
0
0
0


In [180]:
annotation_1= df_11['annotation_85']
annotation_2= df_11['annotation_86']
annotation_3= df_11['annotation_87']
annotation_4= df_11['annotation_91']

### Cohen_kappa average score of annotation 85

In [181]:
def cohen_avg_1():
    cohen_score_12 = cohen_kappa_score(annotation_1,annotation_2)
    cohen_score_13 = cohen_kappa_score(annotation_1,annotation_3)
    cohen_score_14 = cohen_kappa_score(annotation_1,annotation_4)
    cohen_score_average_1 = round((cohen_score_12 + cohen_score_13 + cohen_score_14)/3,3)
    #print("Average Cohen Kappa Score for annotation 85:",(cohen_score_average_1))
    return cohen_score_average_1

In [182]:
cohen_avg_1()

0.658

### Cohen_kappa average score of annotation 86

In [183]:
def cohen_avg_2():
    cohen_score_21 = cohen_kappa_score(annotation_2,annotation_1)
    cohen_score_23 = cohen_kappa_score(annotation_2,annotation_3)
    cohen_score_24 = cohen_kappa_score(annotation_2,annotation_4)
    cohen_score_average_2 = round((cohen_score_21 + cohen_score_23 + cohen_score_24)/3,3)
    #print("Average Cohen Kappa Score for annotation 86:",(cohen_score_average_2))
    return cohen_score_average_2

In [184]:
cohen_avg_2()

0.621

### Cohen_kappa average score of annotation 87

In [185]:
def cohen_avg_3():
    cohen_score_31 = cohen_kappa_score(annotation_3,annotation_1)
    cohen_score_32 = cohen_kappa_score(annotation_3,annotation_2)
    cohen_score_34 = cohen_kappa_score(annotation_3,annotation_4)
    cohen_score_average_3 = round((cohen_score_31 + cohen_score_32 + cohen_score_34)/3,3)
    #print("Average Cohen Kappa Score for annotation 87:",(cohen_score_average_3))
    return cohen_score_average_3

In [186]:
cohen_avg_3()

0.636

### Cohen_kappa average score of annotation 91

In [187]:
def cohen_avg_4():
    cohen_score_41 = cohen_kappa_score(annotation_4,annotation_1)
    cohen_score_42 = cohen_kappa_score(annotation_4,annotation_2)
    cohen_score_43 = cohen_kappa_score(annotation_4,annotation_3)
    cohen_score_average_4 = round((cohen_score_41 + cohen_score_42 + cohen_score_43)/3,3)
    #print("Average Cohen Kappa Score for annotation 91:",(cohen_score_average_4))
    return cohen_score_average_4

In [188]:
cohen_avg_4()

0.668

##### If average cohen kappa score for any annotation is less than 0.2, we will drop it

In [189]:
def drop_annotate():
    average = [cohen_avg_1(), cohen_avg_2(),cohen_avg_3(),cohen_avg_4()]
    columns_name = ['annotation_85','annotation_86','annotation_87','annotation_91']
    #columns =[annotation_1,annotation_2,annotation_3,annotation_4]
    for i in range(0,len(average)):
        if average[i]< 0.2:
            print(columns_name[i])
            print(average[i])
            df_11.drop(columns_name[i], axis=1, inplace = True)
    print(df_11.head())
    r = []
    for col in df_11.columns:
        r.append(col)
    print(r[1:])

In [190]:
drop_annotate()

                                                text  annotation_91  \
0  Bad news. Johnson &amp; Johnson pauses #CovidV...           True   
1  Saw this on TV and you women Need to work with...          False   
2  Two Indian vaccine candidates against COVID-19...           True   
3  The point is media &amp; govt lie abt the numb...           True   
4  California students will continue wearing mask...          False   

   annotation_85  annotation_86  annotation_87  
0           True           True           True  
1          False          False          False  
2           True           True           True  
3           True           True           True  
4          False          False          False  
['annotation_91', 'annotation_85', 'annotation_86', 'annotation_87']


In [191]:
def updateLabel(df_11):
    for index, rows in df_11.iterrows():
        match = [[cohen_avg_1(),rows['annotation_85']],
                   [cohen_avg_2(),rows['annotation_86']],
                   [cohen_avg_3(),rows['annotation_87']],
                   [cohen_avg_4(),rows['annotation_91']]]
        boolean = [rows['annotation_85'],rows['annotation_86'],rows['annotation_87'],rows['annotation_91']]
        match.sort(key = lambda k:k[0])
        true = match.count('True')
        false = match.count('False')
        if true == false: 
            kapaValue, colValue = match[-1]
            df_11.loc[index, 'Final Label'] = colValue
            
        elif true!=false:
            if true > false:
                df_11.loc[index, 'Final Label'] = True
            else:
                df_11.loc[index, 'Final Label'] = False
    

In [192]:
updateLabel(df_11)
df_11 = df_11.iloc[:,[0,-1]]
df_11.head()

Unnamed: 0,text,Final Label
0,Bad news. Johnson &amp; Johnson pauses #CovidV...,True
1,Saw this on TV and you women Need to work with...,False
2,Two Indian vaccine candidates against COVID-19...,True
3,The point is media &amp; govt lie abt the numb...,True
4,California students will continue wearing mask...,False


In [193]:
df_11.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_3_vaccination.csv',
           index = False)

## Merging twitter_topic_lockdowns

In [194]:
df = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_0_lockdowns.csv', sep=',', header=None,index_col = False)
df1 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_1_lockdowns.csv', sep=',', header=None,index_col = False)
df2 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_2_lockdowns.csv', sep=',', header=None,index_col = False)
df3 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_3_lockdowns.csv', sep=',', header=None,index_col = False)

In [195]:
df.reset_index(drop=True)
df1.reset_index( drop=True)
df2.reset_index(drop=True)
df3.reset_index(drop=True)
df_combined = pd.concat([df,df1,df2,df3], axis=0)

In [196]:
df_combined.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary_final\twitter_topic_lockdowns.csv',index= False,header = False)

In [197]:
df_combined.describe()

Unnamed: 0,0,1
count,1204,1204
unique,1196,3
top,STUDIES SHOW MORPHINE MILLIGRAM EQUIVALENT (MM...,False
freq,4,1145


## Merging twitter_topic_masking_and_distancing

In [198]:
df = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_0_masking_and_distancing.csv', sep=',', header=None,index_col = False)
df1 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_1_masking_and_distancing.csv', sep=',', header=None,index_col = False)
df2 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_2_masking_and_distancing.csv', sep=',', header=None,index_col = False)
df3 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_3_masking_and_distancing.csv', sep=',', header=None,index_col = False)

In [199]:
df.reset_index(drop=True)
df1.reset_index( drop=True)
df2.reset_index(drop=True)
df3.reset_index(drop=True)
df_combined = pd.concat([df,df1,df2,df3], axis=0)

In [200]:
df_combined.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary_final\twitter_topic_masking_and_distancing.csv',index= False,header = False)

## Merging twitter_topic_vaccination


In [201]:
df = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_0_vaccination.csv', sep=',', header=None,index_col = False)
df1 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_1_vaccination.csv', sep=',', header=None,index_col = False)
df2 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_2_vaccination.csv', sep=',', header=None,index_col = False)
df3 = pd.read_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary\twitter_topic_3_vaccination.csv', sep=',', header=None,index_col = False)

In [202]:
df.reset_index(drop=True)
df1.reset_index( drop=True)
df2.reset_index(drop=True)
df3.reset_index(drop=True)
df_combined = pd.concat([df,df1,df2,df3], axis=0)

In [203]:
df_combined.to_csv(r'C:\Users\stuar\Desktop\NLP Assignment 2\new_files\primary_final\twitter_topic_vaccination.csv',index= False,header = False)