In [24]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Load the dataset
file_path = '/home/aaronbry/my_env/data/dev.json'
data = pd.read_json(file_path)

# Define a list of common emotional words
#emotional_words = ["love", "hate", "wonderful", "excess", "terrible", "fantastic", "awful", "great", "bad", "amazing", "horrible", "excellent", "poor", "disaster"]
emotional_words = [
    'more',
    'most',
    'higher',
    'biggest',
    'best',
    'cheaper',
    'lower',
    'fewer',
    'less',
    'nigeria',
    'bigger',
    'greatest',
    'speaker',
    'worse',
    'older',
    'least',
    'saudi',
    'lighter',
    'fastest',
    'wildfire',
    'greater',
    'highest',
    'richest'
]

# Preprocess the text to make it easier to count the emotional words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Function to count the number of emotional words in each claim
def count_emotional_words(tokens, emotional_words):
    count = sum(1 for word in tokens if word in emotional_words)
    return count

# Apply preprocessing and count emotional words
data['tokens'] = data['claim'].apply(preprocess_text)
data['emotional_word_count'] = data['tokens'].apply(lambda tokens: count_emotional_words(tokens, emotional_words))

# Display the DataFrame with the new columns
print(data[['claim', 'label', 'emotional_word_count']].head())

# Analyze the correlation between emotional word count and the labels
emotional_word_count_analysis = data.groupby('label')['emotional_word_count'].mean()
print(emotional_word_count_analysis)


                                               claim    label  \
0  In a letter to Steve Jobs, Sean Connery refuse...  Refuted   
1  Trump Administration claimed songwriter Billie...  Refuted   
2  Due to Imran Khan's criticism of Macron's comm...  Refuted   
3  UNESCO declared Nadar community as the most an...  Refuted   
4  Republican Matt Gaetz was part of a company th...  Refuted   

   emotional_word_count  
0                     0  
1                     0  
2                     0  
3                     1  
4                     0  
label
Conflicting Evidence/Cherrypicking    0.131579
Not Enough Evidence                   0.857143
Refuted                               0.131148
Supported                             0.327869
Name: emotional_word_count, dtype: float64


V2

In [29]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter


# Load the dataset
file_path = '/home/aaronbry/my_env/data/dev.json'
data = pd.read_json(file_path)

# Define the list of common emotional words
emotional_words_list = [
    'more', 'most', 'higher', 'biggest', 'best', 'cheaper', 'lower', 'fewer',
    'less', 'nigeria', 'bigger', 'greatest', 'speaker', 'worse', 'older',
    'least', 'saudi', 'lighter', 'fastest', 'wildfire', 'greater', 'highest',
    'richest'
]

# Preprocess the text to make it easier to count the emotional words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Function to count the number of emotional words in each claim
def count_emotional_words(tokens, emotional_words):
    count = sum(1 for word in tokens if word in emotional_words)
    return count

# Apply preprocessing and count emotional words
data['tokens'] = data['claim'].apply(preprocess_text)
data['emotional_word_count'] = data['tokens'].apply(lambda tokens: count_emotional_words(tokens, emotional_words_list))

# Display the DataFrame with the new columns
print(data[['claim', 'label', 'emotional_word_count']].head())

# Analyze the correlation between emotional word count and the labels
emotional_word_count_analysis = data.groupby('label')['emotional_word_count'].mean()
print(emotional_word_count_analysis)


                                               claim    label  \
0  In a letter to Steve Jobs, Sean Connery refuse...  Refuted   
1  Trump Administration claimed songwriter Billie...  Refuted   
2  Due to Imran Khan's criticism of Macron's comm...  Refuted   
3  UNESCO declared Nadar community as the most an...  Refuted   
4  Republican Matt Gaetz was part of a company th...  Refuted   

   emotional_word_count  
0                     0  
1                     0  
2                     0  
3                     1  
4                     0  
label
Conflicting Evidence/Cherrypicking    0.131579
Not Enough Evidence                   0.857143
Refuted                               0.131148
Supported                             0.327869
Name: emotional_word_count, dtype: float64


In [21]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
#nltk.download('punkt')
# Download necessary NLTK data
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

# Load the dataset
file_path = '/home/aaronbry/my_env/data/dev.json'
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Flatten all tokens into a single list
all_tokens = [token for tokens in data['tokens'] for token in tokens]

# POS tagging to filter adjectives and emotional words
tagged_tokens = nltk.pos_tag(all_tokens)
adjective_words = [word for word, pos in tagged_tokens if pos in ['JJR', 'JJS', 'RB' 'RBR', 'RBS']]

# Frequency distribution of adjectives/emotional words
freq_dist = Counter(adjective_words)

# Get the 100 most common adjectives/emotional words
most_common_adjectives = freq_dist.most_common(100)

# Convert to DataFrame for better readability
most_common_adjectives_df = pd.DataFrame(most_common_adjectives, columns=['Word', 'Frequency'])
print(most_common_adjectives_df)


        Word  Frequency
0       more         11
1       most          8
2     higher          5
3    biggest          4
4       best          3
5    cheaper          3
6      lower          3
7      fewer          2
8       less          2
9    nigeria          2
10    bigger          1
11  greatest          1
12   speaker          1
13     worse          1
14     older          1
15     least          1
16     saudi          1
17   lighter          1
18   fastest          1
19  wildfire          1
20   greater          1
21   highest          1
22   richest          1


In [42]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Load the dataset
file_path = 'my_env/data/dev.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Flatten all tokens into a single list
all_tokens = [token for tokens in data['tokens'] for token in tokens]

# POS tagging to filter adjectives and emotional words
tagged_tokens = nltk.pos_tag(all_tokens)
adjective_words = [word for word, pos in tagged_tokens if pos in ['JJR', 'JJS', 'RBR', 'RBS']]

# Frequency distribution of adjectives/emotional words
freq_dist = Counter(adjective_words)

# Get the 100 most common adjectives/emotional words
most_common_adjectives = freq_dist.most_common(100)

# Convert to DataFrame for better readability
most_common_adjectives_df = pd.DataFrame(most_common_adjectives, columns=['Word', 'Frequency'])
print(most_common_adjectives_df)

# Create a set of the most common adjectives for quick lookup
most_common_adjectives_set = set([word for word, _ in most_common_adjectives])

# Function to count the number of most common adjectives in each claim
def count_common_adjectives(tokens, common_adjectives):
    count = sum(1 for word in tokens if word in common_adjectives)
    return count

# Apply preprocessing and count common adjectives
data['common_adjective_count'] = data['tokens'].apply(lambda tokens: count_common_adjectives(tokens, most_common_adjectives_set))

# Display the DataFrame with the new columns
print(data[['claim', 'label', 'common_adjective_count']].head())

# Analyze the correlation between common adjective count and the labels
common_adjective_count_analysis = data.groupby('label')['common_adjective_count'].mean()
print(common_adjective_count_analysis)


        Word  Frequency
0       more         19
1       most          8
2     higher          5
3    biggest          4
4       less          3
5       best          3
6    cheaper          3
7      lower          3
8      fewer          2
9    nigeria          2
10    better          2
11    bigger          1
12  greatest          1
13   speaker          1
14    hunter          1
15     worse          1
16     older          1
17     least          1
18     saudi          1
19   lighter          1
20   fastest          1
21  wildfire          1
22   greater          1
23    matter          1
24   highest          1
25   richest          1
                                               claim    label  \
0  In a letter to Steve Jobs, Sean Connery refuse...  Refuted   
1  Trump Administration claimed songwriter Billie...  Refuted   
2  Due to Imran Khan's criticism of Macron's comm...  Refuted   
3  UNESCO declared Nadar community as the most an...  Refuted   
4  Republican Matt Gaetz wa