## Use common adjs to find pattern in label *ignore

In [126]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Load the dataset
file_path = '/home/aaronbry/my_env/data/dev.json'
data = pd.read_json(file_path)

# Define a list of common emotional words
#emotional_words = ["love", "hate", "wonderful", "excess", "terrible", "fantastic", "awful", "great", "bad", "amazing", "horrible", "excellent", "poor", "disaster"]
emotional_words = [
    'more',
    'most',
    'higher',
    'biggest',
    'best',
    'cheaper',
    'lower',
    'fewer',
    'less',
    'nigeria',
    'bigger',
    'greatest',
    'speaker',
    'worse',
    'older',
    'least',
    'saudi',
    'lighter',
    'fastest',
    'wildfire',
    'greater',
    'highest',
    'richest'
]

# Preprocess the text to make it easier to count the emotional words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Function to count the number of emotional words in each claim
def count_emotional_words(tokens, emotional_words):
    count = sum(1 for word in tokens if word in emotional_words)
    return count

# Apply preprocessing and count emotional words
data['tokens'] = data['claim'].apply(preprocess_text)
data['emotional_word_count'] = data['tokens'].apply(lambda tokens: count_emotional_words(tokens, emotional_words))

# Display the DataFrame with the new columns
print(data[['claim', 'label', 'emotional_word_count']].head())

# Analyze the correlation between emotional word count and the labels
emotional_word_count_analysis = data.groupby('label')['emotional_word_count'].mean()
print(emotional_word_count_analysis)


                                               claim    label  \
0  In a letter to Steve Jobs, Sean Connery refuse...  Refuted   
1  Trump Administration claimed songwriter Billie...  Refuted   
2  Due to Imran Khan's criticism of Macron's comm...  Refuted   
3  UNESCO declared Nadar community as the most an...  Refuted   
4  Republican Matt Gaetz was part of a company th...  Refuted   

   emotional_word_count  
0                     0  
1                     0  
2                     0  
3                     1  
4                     0  
label
Conflicting Evidence/Cherrypicking    0.131579
Not Enough Evidence                   0.857143
Refuted                               0.131148
Supported                             0.327869
Name: emotional_word_count, dtype: float64


## V2 *ignore

In [127]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter


# Load the dataset
file_path = '/home/aaronbry/my_env/data/dev.json'
data = pd.read_json(file_path)

# Define the list of common emotional words
emotional_words_list = [
    'more', 'most', 'higher', 'biggest', 'best', 'cheaper', 'lower', 'fewer',
    'less', 'nigeria', 'bigger', 'greatest', 'speaker', 'worse', 'older',
    'least', 'saudi', 'lighter', 'fastest', 'wildfire', 'greater', 'highest',
    'richest'
]

# Preprocess the text to make it easier to count the emotional words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Function to count the number of emotional words in each claim
def count_emotional_words(tokens, emotional_words):
    count = sum(1 for word in tokens if word in emotional_words)
    return count

# Apply preprocessing and count emotional words
data['tokens'] = data['claim'].apply(preprocess_text)
data['emotional_word_count'] = data['tokens'].apply(lambda tokens: count_emotional_words(tokens, emotional_words_list))

# Display the DataFrame with the new columns
print(data[['claim', 'label', 'emotional_word_count']].head())

# Analyze the correlation between emotional word count and the labels
emotional_word_count_analysis = data.groupby('label')['emotional_word_count'].mean()
print(emotional_word_count_analysis)


                                               claim    label  \
0  In a letter to Steve Jobs, Sean Connery refuse...  Refuted   
1  Trump Administration claimed songwriter Billie...  Refuted   
2  Due to Imran Khan's criticism of Macron's comm...  Refuted   
3  UNESCO declared Nadar community as the most an...  Refuted   
4  Republican Matt Gaetz was part of a company th...  Refuted   

   emotional_word_count  
0                     0  
1                     0  
2                     0  
3                     1  
4                     0  
label
Conflicting Evidence/Cherrypicking    0.131579
Not Enough Evidence                   0.857143
Refuted                               0.131148
Supported                             0.327869
Name: emotional_word_count, dtype: float64


## Dynamic Adjective / Adverb frequency

In [128]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
#nltk.download('punkt')
# Download necessary NLTK data
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

# Load the dataset
file_path = '/home/aaronbry/my_env/data/dev.json'
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Flatten all tokens into a single list
all_tokens = [token for tokens in data['tokens'] for token in tokens]

# POS tagging to filter adjectives and emotional words
tagged_tokens = nltk.pos_tag(all_tokens)
adjective_words = [word for word, pos in tagged_tokens if pos in ['JJR', 'JJS', 'RB' 'RBR', 'RBS']]

# Frequency distribution of adjectives/emotional words
freq_dist = Counter(adjective_words)

# Get the 100 most common adjectives/emotional words
most_common_adjectives = freq_dist.most_common(100)

# Convert to DataFrame for better readability
most_common_adjectives_df = pd.DataFrame(most_common_adjectives, columns=['Word', 'Frequency'])
print(most_common_adjectives_df)


        Word  Frequency
0       more         11
1       most          8
2     higher          5
3    biggest          4
4       best          3
5    cheaper          3
6      lower          3
7      fewer          2
8       less          2
9    nigeria          2
10    bigger          1
11  greatest          1
12   speaker          1
13     worse          1
14     older          1
15     least          1
16     saudi          1
17   lighter          1
18   fastest          1
19  wildfire          1
20   greater          1
21   highest          1
22   richest          1


## Use frequency of POS to identify most common words (adjs, adv, modifiers) in claim

In [129]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Load the dataset
file_path = 'my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Flatten all tokens into a single list
all_tokens = [token for tokens in data['tokens'] for token in tokens]

# POS tagging to filter adjectives and emotional words
tagged_tokens = nltk.pos_tag(all_tokens)
selected_pos = ['JJR']
#['JJR', 'JJS', 'RB', 'RBR', 'RBS', 'MD', 'UH']
adjective_words = [word for word, pos in tagged_tokens if pos in selected_pos]

# Frequency distribution of adjectives/emotional words
freq_dist = Counter(adjective_words)

# Get the 100 most common adjectives/emotional words
most_common_adjectives = freq_dist.most_common(50)

# Convert to DataFrame for better readability
most_common_adjectives_df = pd.DataFrame(most_common_adjectives, columns=['Word', 'Frequency'])
print(most_common_adjectives_df[:25])

# Create a set of the most common adjectives for quick lookup
most_common_adjectives_set = set([word for word, _ in most_common_adjectives])

# Function to count the number of most common adjectives in each claim
def count_common_adjectives(tokens, common_adjectives):
    count = sum(1 for word in tokens if word in common_adjectives)
    return count

# Apply preprocessing and count common adjectives
data['common_adjective_count'] = data['tokens'].apply(lambda tokens: count_common_adjectives(tokens, most_common_adjectives_set))

# Display the DataFrame with the new columns
print(data[['claim', 'label', 'common_adjective_count']].head(20))

# Analyze the correlation between common adjective count and the labels
common_adjective_count_analysis = data.groupby('label')['common_adjective_count'].mean()
print(selected_pos)
print(common_adjective_count_analysis)


        Word  Frequency
0       more        110
1       less         27
2      lower         18
3     higher         16
4      fewer         11
5     better          8
6     longer          3
7      older          3
8    younger          3
9    smaller          3
10   greater          2
11      lakh          2
12  offender          1
13    cooper          1
14   cheaper          1
15    easier          1
16   tougher          1
17   soldier          1
18      paid          1
19    muslim          1
20    bigger          1
21     crore          1
22     roger          1
                                                claim  \
0   Hunter Biden had no experience in Ukraine or i...   
1   Donald Trump delivered the largest tax cuts in...   
2   In Nigeria … in terms of revenue share, 20% go...   
3   Biden has pledged to stop border wall construc...   
4   After the police shooting of Jacob Blake, Gov....   
5   The Common Law Admission Test (CLAT) 2020 will...   
6         35% of revenue 

In [130]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Load the dataset
file_path = 'my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Function to extract POS tags
def extract_pos_tags(tokens):
    return nltk.pos_tag(tokens)

data['pos_tags'] = data['tokens'].apply(extract_pos_tags)

# Function to count POS tags in each claim
def count_pos_tags(pos_tags):
    pos_counts = Counter(pos for word, pos in pos_tags)
    return pos_counts

# Apply function to count POS tags
data['pos_counts'] = data['pos_tags'].apply(count_pos_tags)

# Flatten all POS tags into a single list for overall frequency
all_pos_tags = [pos for pos_counts in data['pos_counts'] for pos in pos_counts]
overall_pos_freq = Counter(all_pos_tags)

# Display the overall most common POS tags
print("Overall Most Common POS Tags:")
print(overall_pos_freq.most_common(10))

# Aggregate results by label
def sum_counters(counters):
    sum_counter = Counter()
    for counter in counters:
        if isinstance(counter, Counter):  # Ensure that we only update with Counter objects
            sum_counter.update(counter)
    return sum_counter

# Apply sum_counters function to ensure proper aggregation
label_pos_counts = data.groupby('label')['pos_counts'].apply(lambda x: sum_counters(x))

# Ensure each entry in label_pos_counts is a Counter object
label_pos_counts = {label: sum_counters(group) for label, group in data.groupby('label')['pos_counts']}

# Display the top 5 POS tags per label
for label, pos_counts in label_pos_counts.items():
    print(f"\nTop 5 POS Tags for '{label}':")
    if isinstance(pos_counts, Counter):
        print(pos_counts.most_common(5))
    else:
        print("No POS counts available for this label.")

# Function to get top N POS tags per label
def top_n_pos_tags_per_label(data, n=5):
    label_pos_counts = {label: sum_counters(group) for label, group in data.groupby('label')['pos_counts']}
    top_pos_tags = {label: counts.most_common(n) for label, counts in label_pos_counts.items() if isinstance(counts, Counter)}
    return top_pos_tags

# Get the top 5 POS tags per label
top_pos_tags = top_n_pos_tags_per_label(data, n=5)

# Display the top 5 POS tags per label
for label, pos_tags in top_pos_tags.items():
    print(f"\nTop 5 POS Tags for '{label}':")
    for pos, count in pos_tags:
        print(f"{pos}: {count}")

# Additional statistics (e.g., mean number of each POS tag per label)
def calculate_mean_pos_per_label(data):
    label_pos_counts = {label: sum_counters(group) for label, group in data.groupby('label')['pos_counts']}
    pos_tag_stats = pd.DataFrame([dict(counter) for counter in label_pos_counts.values() if isinstance(counter, Counter)]).fillna(0).mean()
    return pos_tag_stats

pos_tag_stats = calculate_mean_pos_per_label(data)

print("\nMean number of each POS tag per label:")
print(pos_tag_stats)


Overall Most Common POS Tags:
[('NN', 2997), ('IN', 2695), ('DT', 2307), ('JJ', 2307), ('NNS', 2281), ('VBD', 1296), ('VBN', 1284), ('VBZ', 1251), ('VB', 1072), ('TO', 1002)]

Top 5 POS Tags for 'Conflicting Evidence/Cherrypicking':
[('NN', 765), ('IN', 381), ('NNS', 295), ('JJ', 277), ('DT', 256)]

Top 5 POS Tags for 'Not Enough Evidence':
[('NN', 1188), ('IN', 724), ('NNS', 439), ('JJ', 434), ('DT', 425)]

Top 5 POS Tags for 'Refuted':
[('NN', 7578), ('IN', 3774), ('JJ', 2690), ('DT', 2615), ('NNS', 2357)]

Top 5 POS Tags for 'Supported':
[('NN', 3407), ('IN', 2066), ('JJ', 1372), ('DT', 1236), ('NNS', 1236)]

Top 5 POS Tags for 'Conflicting Evidence/Cherrypicking':
NN: 765
IN: 381
NNS: 295
JJ: 277
DT: 256

Top 5 POS Tags for 'Not Enough Evidence':
NN: 1188
IN: 724
NNS: 439
JJ: 434
DT: 425

Top 5 POS Tags for 'Refuted':
NN: 7578
IN: 3774
JJ: 2690
DT: 2615
NNS: 2357

Top 5 POS Tags for 'Supported':
NN: 3407
IN: 2066
JJ: 1372
DT: 1236
NNS: 1236

Mean number of each POS tag per label:
J

## Chi ^2

In [131]:
from scipy.stats import chi2_contingency
import numpy as np

# Create a contingency table
def create_contingency_table(data, pos_tag):
    table = pd.crosstab(data['label'], data['pos_counts'].apply(lambda x: x.get(pos_tag, 0) > 0))
    return table

# Perform Chi-Square test for each POS tag
chi2_results = {}
for pos_tag in overall_pos_freq.keys():
    contingency_table = create_contingency_table(data, pos_tag)
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    chi2_results[pos_tag] = (chi2, p)

# Get the POS tags with the lowest p-values
sorted_chi2_results = sorted(chi2_results.items(), key=lambda item: item[1][1])
print("Top POS tags by Chi-Square test:")
for pos_tag, (chi2, p) in sorted_chi2_results[:10]:
    print(f"POS Tag: {pos_tag}, Chi2: {chi2}, p-value: {p}")


Top POS tags by Chi-Square test:
POS Tag: CD, Chi2: 55.67679084161057, p-value: 4.923975397460288e-12
POS Tag: VB, Chi2: 26.434160091168728, p-value: 7.735751751849582e-06
POS Tag: MD, Chi2: 24.82168425892936, p-value: 1.6824400398383774e-05
POS Tag: JJR, Chi2: 24.294370481646954, p-value: 2.1683108341069872e-05
POS Tag: NNS, Chi2: 22.43599054320315, p-value: 5.293002923054079e-05
POS Tag: IN, Chi2: 15.81278952091885, p-value: 0.0012387295412777974
POS Tag: RBR, Chi2: 14.960175648701716, p-value: 0.0018509998096514445
POS Tag: WP, Chi2: 10.869356367762885, p-value: 0.012453703243457464
POS Tag: CC, Chi2: 9.940817523178055, p-value: 0.01907596552265242
POS Tag: VBN, Chi2: 9.706298818815164, p-value: 0.021234996947359988


## Mutual Information

In [132]:
from sklearn.feature_selection import mutual_info_classif

# Create a DataFrame where each column is a POS tag and each row is a claim
pos_tag_df = pd.DataFrame([{pos: count for pos, count in counter.items()} for counter in data['pos_counts']]).fillna(0)

# Encode labels
label_encoder = {label: idx for idx, label in enumerate(data['label'].unique())}
y = data['label'].map(label_encoder)

# Calculate mutual information
mi = mutual_info_classif(pos_tag_df, y, discrete_features=True)

# Get the top POS tags by mutual information
mi_scores = pd.Series(mi, index=pos_tag_df.columns)
mi_scores = mi_scores.sort_values(ascending=False)
print("Top POS tags by Mutual Information:")
print(mi_scores.head(10))


Top POS tags by Mutual Information:
CD     0.011774
IN     0.011712
NN     0.010345
NNS    0.009498
VB     0.008839
CC     0.006807
DT     0.006082
MD     0.006080
PRP    0.005993
JJ     0.005534
dtype: float64


## Classifier

In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Prepare the dataset
X = pos_tag_df
y = data['label']

# Binarize the labels for one-vs-rest classification
lb = LabelBinarizer()
y_bin = lb.fit_transform(y)

# Function to train a classifier for each label
def train_classifier_per_label(X, y_bin, label_names):
    feature_importance = {}
    for i, label in enumerate(label_names):
        print(f"Training classifier for label: {label}")
        y_label = y_bin[:, i]
        X_train, X_test, y_train, y_test = train_test_split(X, y_label, test_size=0.2, random_state=42)
        
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        
        # Get the coefficients
        coef = pd.Series(model.coef_[0], index=X.columns)
        feature_importance[label] = coef.sort_values(ascending=False)
        
    return feature_importance

# Train classifiers and get feature importance
label_names = lb.classes_
feature_importance = train_classifier_per_label(X, y_bin, label_names)

# Display the top 5 POS tags by coefficients for each label
for label, coef in feature_importance.items():
    print(f"\nTop 5 POS tags for label '{label}':")
    print(coef.head(5))
    print(f"\nBottom 5 POS tags for label '{label}':")
    print(coef.tail(5))


Training classifier for label: Conflicting Evidence/Cherrypicking
Training classifier for label: Not Enough Evidence
Training classifier for label: Refuted
Training classifier for label: Supported

Top 5 POS tags for label 'Conflicting Evidence/Cherrypicking':
JJR     0.415546
RP      0.246184
NNS     0.217197
PRP$    0.206919
CC      0.195953
dtype: float64

Bottom 5 POS tags for label 'Conflicting Evidence/Cherrypicking':
VBN   -0.360008
WRB   -0.390771
EX    -0.557441
PDT   -0.748603
JJS   -1.084974
dtype: float64

Top 5 POS tags for label 'Not Enough Evidence':
NNPS    1.254270
PDT     0.679914
WRB     0.461062
RBR     0.401490
CC      0.392261
dtype: float64

Bottom 5 POS tags for label 'Not Enough Evidence':
JJ     -0.118838
PRP$   -0.199226
FW     -0.266749
PRP    -0.358378
NNP    -0.715390
dtype: float64

Top 5 POS tags for label 'Refuted':
FW     1.218443
EX     0.311199
WP     0.278311
WP$    0.245163
RBS    0.227018
dtype: float64

Bottom 5 POS tags for label 'Refuted':
TO  

In [134]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report


# Load the dataset
file_path = 'my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Function to extract POS tags
def extract_pos_tags(tokens):
    return nltk.pos_tag(tokens)

data['pos_tags'] = data['tokens'].apply(extract_pos_tags)

# Function to count POS tags in each claim
def count_pos_tags(pos_tags):
    pos_counts = Counter(pos for word, pos in pos_tags)
    return pos_counts

# Apply function to count POS tags
data['pos_counts'] = data['pos_tags'].apply(count_pos_tags)

# Debugging: Verify that 'pos_counts' column exists and is correctly populated
print("\nSample of 'pos_counts':")
print(data['pos_counts'].head())

# Flatten all POS tags into a single list for overall frequency
all_pos_tags = [pos for pos_counts in data['pos_counts'] for pos in pos_counts]
overall_pos_freq = Counter(all_pos_tags)

# Display the overall most common POS tags
print("\nOverall Most Common POS Tags:")
print(overall_pos_freq.most_common(10))

# Aggregate results by label
def sum_counters(counters):
    sum_counter = Counter()
    for counter in counters:
        if isinstance(counter, Counter):  # Ensure that we only update with Counter objects
            sum_counter.update(counter)
    return sum_counter

# Apply sum_counters function to ensure proper aggregation
label_pos_counts = data.groupby('label')['pos_counts'].apply(lambda x: sum_counters(x))

# Ensure each entry in label_pos_counts is a Counter object
label_pos_counts = {label: sum_counters(group) for label, group in data.groupby('label')['pos_counts']}

# Display the top 5 POS tags per label
for label, pos_counts in label_pos_counts.items():
    print(f"\nTop 5 POS Tags for '{label}':")
    if isinstance(pos_counts, Counter):
        print(pos_counts.most_common(5))
    else:
        print("No POS counts available for this label.")

# POS tags to consider based on the results (top 5 and bottom 5 for each label)
pos_tags_considered = {
    'Conflicting Evidence/Cherrypicking': ['JJR', 'RP', 'NNS', 'PRP$', 'CC', 'VBN', 'WRB', 'EX', 'PDT', 'JJS'],
    'Not Enough Evidence': ['NNPS', 'PDT', 'WRB', 'RBR', 'CC', 'JJ', 'PRP$', 'FW', 'PRP', 'NNP'],
    'Refuted': ['FW', 'EX', 'WP', 'WP$', 'RBS', 'TO', 'WDT', 'CD', 'JJR', 'RBR'],
    'Supported': ['RBR', 'JJR', 'TO', 'NNP', 'CD', 'RBS', 'EX', 'WP', 'NNPS', 'FW']
}

# Coefficients for each label
coefficients = {
    'Conflicting Evidence/Cherrypicking': {'JJR': 0.415546, 'RP': 0.246184, 'NNS': 0.217197, 'PRP$': 0.206919, 'CC': 0.195953, 'VBN': -0.360008, 'WRB': -0.390771, 'EX': -0.557441, 'PDT': -0.748603, 'JJS': -1.084974},
    'Not Enough Evidence': {'NNPS': 1.254270, 'PDT': 0.679914, 'WRB': 0.461062, 'RBR': 0.401490, 'CC': 0.392261, 'JJ': -0.118838, 'PRP$': -0.199226, 'FW': -0.266749, 'PRP': -0.358378, 'NNP': -0.715390},
    'Refuted': {'FW': 1.218443, 'EX': 0.311199, 'WP': 0.278311, 'WP$': 0.245163, 'RBS': 0.227018, 'TO': -0.271341, 'WDT': -0.298864, 'CD': -0.357227, 'JJR': -0.432570, 'RBR': -0.620916},
    'Supported': {'RBR': 0.399382, 'JJR': 0.359911, 'TO': 0.314293, 'NNP': 0.290118, 'CD': 0.228353, 'RBS': -0.326874, 'EX': -0.444007, 'WP': -0.446281, 'NNPS': -0.863679, 'FW': -0.953218}
}

# Function to classify a single claim
def classify_claim(pos_counts):
    scores = {label: 0.0 for label in coefficients.keys()}
    for label in coefficients.keys():
        for pos_tag, coef in coefficients[label].items():
            scores[label] += coef * pos_counts.get(pos_tag, 0)
    return max(scores, key=scores.get)

# Apply classification to each claim
data['predicted_label'] = data['pos_counts'].apply(classify_claim)

# Evaluate the classification
print("\nClassification Report:")
print(classification_report(data['label'], data['predicted_label']))



Sample of 'pos_counts':
0    {'NN': 8, 'VBD': 2, 'DT': 3, 'IN': 3, 'CC': 1,...
1    {'JJ': 2, 'NN': 3, 'VBD': 1, 'DT': 1, 'JJS': 1...
2    {'IN': 3, 'NNS': 2, 'NN': 3, 'VBZ': 1, 'TO': 1...
3    {'NN': 7, 'VBZ': 1, 'VBN': 1, 'TO': 2, 'VB': 2...
4    {'IN': 3, 'DT': 1, 'NN': 4, 'NNS': 3, 'VBP': 2...
Name: pos_counts, dtype: object

Overall Most Common POS Tags:
[('NN', 2997), ('IN', 2695), ('DT', 2307), ('JJ', 2307), ('NNS', 2281), ('VBD', 1296), ('VBN', 1284), ('VBZ', 1251), ('VB', 1072), ('TO', 1002)]

Top 5 POS Tags for 'Conflicting Evidence/Cherrypicking':
[('NN', 765), ('IN', 381), ('NNS', 295), ('JJ', 277), ('DT', 256)]

Top 5 POS Tags for 'Not Enough Evidence':
[('NN', 1188), ('IN', 724), ('NNS', 439), ('JJ', 434), ('DT', 425)]

Top 5 POS Tags for 'Refuted':
[('NN', 7578), ('IN', 3774), ('JJ', 2690), ('DT', 2615), ('NNS', 2357)]

Top 5 POS Tags for 'Supported':
[('NN', 3407), ('IN', 2066), ('JJ', 1372), ('DT', 1236), ('NNS', 1236)]

Classification Report:
                        

In [135]:
from sklearn.ensemble import RandomForestClassifier

# Prepare the dataset
X = pos_tag_df
y = data['label'].map(label_encoder)

# Train a RandomForest classifier
model = RandomForestClassifier(n_estimators=1000, random_state=42)
model.fit(X, y)

# Get feature importances
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
filtered_pos_tags = feature_importances[feature_importances > 0.001].index.tolist()  # Keep only features with importance > 0.01
print(f"Filtered POS Tags: {filtered_pos_tags}")

# Update coefficients and pos_tags_considered with filtered POS tags
coefficients_filtered = {label: {pos_tag: coef for pos_tag, coef in coefs.items() if pos_tag in filtered_pos_tags} for label, coefs in coefficients.items()}
pos_tags_considered_filtered = {label: [pos_tag for pos_tag in pos_tags if pos_tag in filtered_pos_tags] for label, pos_tags in pos_tags_considered.items()}


Filtered POS Tags: ['NN', 'VBD', 'DT', 'IN', 'CC', 'WRB', 'PRP', 'JJ', 'JJS', 'NNS', 'VBZ', 'TO', 'VBN', 'VB', 'VBP', 'RB', 'MD', 'WP', 'PRP$', 'RP', 'RBR', 'JJR', 'EX', 'CD', 'VBG', 'WDT', 'RBS', 'PDT', 'NNPS', 'NNP']


In [136]:
# Function to classify a single claim using filtered POS tags
def classify_claim_filtered(pos_counts):
    scores = {label: 0.0 for label in coefficients_filtered.keys()}
    for label in coefficients_filtered.keys():
        for pos_tag, coef in coefficients_filtered[label].items():
            scores[label] += coef * pos_counts.get(pos_tag, 0)
    return max(scores, key=scores.get)

# Apply classification to each claim
data['predicted_label_filtered'] = data['pos_counts'].apply(classify_claim_filtered)

# Evaluate the classification
print("\nClassification Report (Filtered POS Tags):")
print(classification_report(data['label'], data['predicted_label_filtered']))



Classification Report (Filtered POS Tags):
                                    precision    recall  f1-score   support

Conflicting Evidence/Cherrypicking       0.07      0.57      0.13       195
               Not Enough Evidence       0.12      0.12      0.12       282
                           Refuted       0.64      0.13      0.22      1742
                         Supported       0.34      0.36      0.35       849

                          accuracy                           0.22      3068
                         macro avg       0.29      0.30      0.20      3068
                      weighted avg       0.47      0.22      0.24      3068



In [137]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Load the dataset
file_path = 'my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Filter dataset to include only two labels
data = data[data['label'].isin(['Conflicting Evidence/Cherrypicking', 'Not Enough Evidence'])]

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Function to extract POS tags
def extract_pos_tags(tokens):
    return nltk.pos_tag(tokens)

data['pos_tags'] = data['tokens'].apply(extract_pos_tags)

# Function to count POS tags in each claim
def count_pos_tags(pos_tags):
    pos_counts = Counter(pos for word, pos in pos_tags)
    return pos_counts

# Apply function to count POS tags
data['pos_counts'] = data['pos_tags'].apply(count_pos_tags)

# Debugging: Verify that 'pos_counts' column exists and is correctly populated
print("\nSample of 'pos_counts':")
print(data['pos_counts'].head())

# Prepare the dataset for classifier
pos_tag_df = pd.DataFrame([{pos: count for pos, count in counter.items()} for counter in data['pos_counts']]).fillna(0)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])

# Train a RandomForest classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(pos_tag_df, y)

# Get feature importances
feature_importances = pd.Series(model.feature_importances_, index=pos_tag_df.columns)
filtered_pos_tags = feature_importances[feature_importances > 0.001].index.tolist()  # Keep only features with importance > 0.01
print(f"Filtered POS Tags: {filtered_pos_tags}")

# Update pos_counts to include only filtered POS tags
def filter_pos_counts(pos_counts, filtered_pos_tags):
    return {pos: count for pos, count in pos_counts.items() if pos in filtered_pos_tags}

data['filtered_pos_counts'] = data['pos_counts'].apply(lambda x: filter_pos_counts(x, filtered_pos_tags))

# Function to classify a single claim using filtered POS tags
coefficients_filtered = {
    'Conflicting Evidence/Cherrypicking': {'JJR': 0.415546, 'RP': 0.246184, 'NNS': 0.217197, 'PRP$': 0.206919, 'CC': 0.195953, 'VBN': -0.360008, 'WRB': -0.390771, 'EX': -0.557441, 'PDT': -0.748603, 'JJS': -1.084974},
    'Not Enough Evidence': {'NNPS': 1.254270, 'PDT': 0.679914, 'WRB': 0.461062, 'RBR': 0.401490, 'CC': 0.392261, 'JJ': -0.118838, 'PRP$': -0.199226, 'FW': -0.266749, 'PRP': -0.358378, 'NNP': -0.715390}
}

def classify_claim_filtered(pos_counts):
    scores = {label: 0.0 for label in coefficients_filtered.keys()}
    for label in coefficients_filtered.keys():
        for pos_tag, coef in coefficients_filtered[label].items():
            scores[label] += coef * pos_counts.get(pos_tag, 0)
    return max(scores, key=scores.get)

# Apply classification to each claim
data['predicted_label_filtered'] = data['filtered_pos_counts'].apply(classify_claim_filtered)

# Evaluate the classification
print("\nClassification Report (Filtered POS Tags):")
print(classification_report(data['label'], data['predicted_label_filtered']))



Sample of 'pos_counts':
7     {'JJ': 1, 'NN': 5, 'VBD': 4, 'DT': 1, 'WP': 1,...
8     {'NN': 5, 'VBD': 2, 'IN': 2, 'DT': 1, 'JJ': 2,...
12    {'JJ': 2, 'NN': 2, 'VBD': 1, 'DT': 1, 'CC': 1,...
18    {'DT': 1, 'JJR': 1, 'NN': 4, 'IN': 4, 'NNS': 5...
19    {'NN': 2, 'VBZ': 1, 'VBN': 1, 'PRP$': 1, 'NNS'...
Name: pos_counts, dtype: object
Filtered POS Tags: ['JJ', 'NN', 'VBD', 'DT', 'WP', 'IN', 'NNS', 'PRP$', 'WRB', 'TO', 'VB', 'CC', 'RB', 'PRP', 'JJR', 'VBP', 'VBZ', 'VBN', 'EX', 'CD', 'MD', 'VBG', 'RP', 'WDT', 'RBR', 'JJS', 'PDT', 'FW', 'RBS', 'NNPS', 'NNP']

Classification Report (Filtered POS Tags):
                                    precision    recall  f1-score   support

Conflicting Evidence/Cherrypicking       0.45      0.81      0.58       195
               Not Enough Evidence       0.70      0.32      0.44       282

                          accuracy                           0.52       477
                         macro avg       0.57      0.56      0.51       477
            

In [150]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


# Load the dataset
file_path = 'my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Function to extract POS tags
def extract_pos_tags(tokens):
    return nltk.pos_tag(tokens)

data['pos_tags'] = data['tokens'].apply(extract_pos_tags)

# Function to count POS tags in each claim
def count_pos_tags(pos_tags):
    pos_counts = Counter(pos for word, pos in pos_tags)
    return pos_counts

data['pos_counts'] = data['pos_tags'].apply(count_pos_tags)

# Function to extract POS n-grams
def extract_pos_ngrams(pos_tags, n):
    pos_sequence = [pos for word, pos in pos_tags]
    ngrams_list = list(ngrams(pos_sequence, n))
    return Counter(ngrams_list)

data['pos_bigrams'] = data['pos_tags'].apply(lambda x: extract_pos_ngrams(x, 2))
data['pos_trigrams'] = data['pos_tags'].apply(lambda x: extract_pos_ngrams(x, 3))

# Function to calculate POS ratios
def calculate_pos_ratios(pos_counts):
    total = sum(pos_counts.values())
    ratios = {pos: count / total for pos, count in pos_counts.items()}
    return ratios

data['pos_ratios'] = data['pos_counts'].apply(calculate_pos_ratios)

# Create a combined feature DataFrame
pos_counts_df = pd.DataFrame(data['pos_counts'].tolist()).fillna(0)
pos_bigrams_df = pd.DataFrame(data['pos_bigrams'].tolist()).fillna(0)
pos_trigrams_df = pd.DataFrame(data['pos_trigrams'].tolist()).fillna(0)
pos_ratios_df = pd.DataFrame(data['pos_ratios'].tolist()).fillna(0)

# Convert tuple column names to strings
pos_bigrams_df.columns = pos_bigrams_df.columns.map(str)
pos_trigrams_df.columns = pos_trigrams_df.columns.map(str)

features_df = pd.concat([pos_counts_df, pos_bigrams_df, pos_trigrams_df, pos_ratios_df], axis=1)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])
target_names = label_encoder.classes_

# Train a logistic regression model
X_train, X_test, y_train, y_test = train_test_split(features_df, y, test_size=0.3, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("\nClassification Report with Advanced Features:")
print(classification_report(y_test, y_pred, target_names=target_names))



Classification Report with Advanced Features:
                                    precision    recall  f1-score   support

Conflicting Evidence/Cherrypicking       0.06      0.02      0.02        65
               Not Enough Evidence       0.14      0.10      0.11        63
                           Refuted       0.66      0.75      0.71       555
                         Supported       0.41      0.40      0.41       238

                          accuracy                           0.57       921
                         macro avg       0.32      0.32      0.31       921
                      weighted avg       0.52      0.57      0.54       921



In [151]:
!pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3


In [152]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE


# Load the dataset
file_path = 'my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Function to extract POS tags
def extract_pos_tags(tokens):
    return nltk.pos_tag(tokens)

data['pos_tags'] = data['tokens'].apply(extract_pos_tags)

# Function to count POS tags in each claim
def count_pos_tags(pos_tags):
    pos_counts = Counter(pos for word, pos in pos_tags)
    return pos_counts

data['pos_counts'] = data['pos_tags'].apply(count_pos_tags)

# Function to extract POS n-grams
def extract_pos_ngrams(pos_tags, n):
    pos_sequence = [pos for word, pos in pos_tags]
    ngrams_list = list(ngrams(pos_sequence, n))
    return Counter(ngrams_list)

data['pos_bigrams'] = data['pos_tags'].apply(lambda x: extract_pos_ngrams(x, 2))
data['pos_trigrams'] = data['pos_tags'].apply(lambda x: extract_pos_ngrams(x, 3))

# Function to calculate POS ratios
def calculate_pos_ratios(pos_counts):
    total = sum(pos_counts.values())
    ratios = {pos: count / total for pos, count in pos_counts.items()}
    return ratios

data['pos_ratios'] = data['pos_counts'].apply(calculate_pos_ratios)

# Create a combined feature DataFrame
pos_counts_df = pd.DataFrame(data['pos_counts'].tolist()).fillna(0)
pos_bigrams_df = pd.DataFrame(data['pos_bigrams'].tolist()).fillna(0)
pos_trigrams_df = pd.DataFrame(data['pos_trigrams'].tolist()).fillna(0)
pos_ratios_df = pd.DataFrame(data['pos_ratios'].tolist()).fillna(0)

# Convert tuple column names to strings
pos_bigrams_df.columns = pos_bigrams_df.columns.map(str)
pos_trigrams_df.columns = pos_trigrams_df.columns.map(str)

features_df = pd.concat([pos_counts_df, pos_bigrams_df, pos_trigrams_df, pos_ratios_df], axis=1)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])
target_names = label_encoder.classes_

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features_df, y, test_size=0.3, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train a logistic regression model with class weights
class_weights = dict(zip(range(len(target_names)), [1.0] * len(target_names)))
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model
y_pred = model.predict(X_test)
print("\nClassification Report with Advanced Features and SMOTE:")
print(classification_report(y_test, y_pred, target_names=target_names))



Classification Report with Advanced Features and SMOTE:
                                    precision    recall  f1-score   support

Conflicting Evidence/Cherrypicking       0.08      0.09      0.08        65
               Not Enough Evidence       0.11      0.21      0.15        63
                           Refuted       0.69      0.56      0.62       555
                         Supported       0.38      0.44      0.41       238

                          accuracy                           0.47       921
                         macro avg       0.31      0.33      0.31       921
                      weighted avg       0.53      0.47      0.49       921



In [162]:
!pip install scipy gensim




## WORD2VEC

In [166]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import numpy as np


# Load the dataset
file_path = 'my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

data['tokens'] = data['claim'].apply(preprocess_text)

# Train a Word2Vec model on the tokenized claims
sentences = data['tokens'].tolist()
word2vec_model = gensim.models.Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, seed=42)

# Function to convert a list of tokens to a Word2Vec vector
def tokens_to_word2vec(tokens, model, vector_size):
    vec = np.zeros(vector_size)
    count = 0
    for token in tokens:
        if token in model.wv:
            vec += model.wv[token]
            count += 1
    if count > 0:
        vec /= count
    return vec

# Apply the function to convert each claim to a Word2Vec vector
vector_size = 100
data['word2vec'] = data['tokens'].apply(lambda x: tokens_to_word2vec(x, word2vec_model, vector_size))

# Create a feature DataFrame from the Word2Vec vectors
features_df = pd.DataFrame(data['word2vec'].tolist())

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label'])
target_names = label_encoder.classes_

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features_df, y, test_size=0.3, random_state=42)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train a logistic regression model with class weights
class_weights = dict(zip(range(len(target_names)), [1.0] * len(target_names)))
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model
y_pred = model.predict(X_test)
print("\nClassification Report with Word2Vec and SMOTE:")
print(classification_report(y_test, y_pred, target_names=target_names))


ImportError: cannot import name 'triu' from 'scipy.linalg' (/home/aaronbry/my_env/lib/python3.11/site-packages/scipy/linalg/__init__.py)

## Find best combination of POS *ignore & all below

In [138]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Load the dataset
file_path = '/home/aaronbry/my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.isalpha()]
    return tokens

data['tokens'] = data['claim'].apply(preprocess_text)


In [139]:
# Function to extract POS tags
def extract_pos_tags(tokens):
    return nltk.pos_tag(tokens)

data['pos_tags'] = data['tokens'].apply(extract_pos_tags)

# Define combinations of POS tags
pos_combinations = [
    ['JJ', 'JJR', 'JJS'],  # Adjectives
    ['RB', 'RBR', 'RBS'],  # Adverbs
    ['MD'],  # Modals
    ['UH'],  # Interjections
    ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'MD', 'UH']  # All combined
]

# Function to count specific POS tags in tokens
def count_pos_tags(pos_tags, pos_list):
    return sum(1 for word, pos in pos_tags if pos in pos_list)

# Create feature sets for each combination
for pos_comb in pos_combinations:
    feature_name = '_'.join(pos_comb) + '_count'
    data[feature_name] = data['pos_tags'].apply(lambda pos_tags: count_pos_tags(pos_tags, pos_comb))


In [140]:
# Prepare the dataset for training
X = data[[('_'.join(pos_comb) + '_count') for pos_comb in pos_combinations]]
y = data['label'].apply(lambda x: 1 if x in ['Conflicting Evidence/Cherrypicking', 'Not Enough Evidence'] else 0)  # Example labeling

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate models
results = {}
for pos_comb in pos_combinations:
    feature_name = '_'.join(pos_comb) + '_count'
    model = LogisticRegression()
    model.fit(X_train[[feature_name]], y_train)
    y_pred = model.predict(X_test[[feature_name]])
    accuracy = accuracy_score(y_test, y_pred)
    results[feature_name] = accuracy
    print(f"Features: {feature_name}, Accuracy: {accuracy}")


Features: JJ_JJR_JJS_count, Accuracy: 0.8566775244299675
Features: RB_RBR_RBS_count, Accuracy: 0.8566775244299675
Features: MD_count, Accuracy: 0.8566775244299675
Features: UH_count, Accuracy: 0.8566775244299675
Features: JJ_JJR_JJS_RB_RBR_RBS_MD_UH_count, Accuracy: 0.8566775244299675


In [141]:
# Display the results
best_combination = max(results, key=results.get)
print(f"Best POS combination: {best_combination}, Accuracy: {results[best_combination]}")

# Detailed classification report for the best combination
best_model = LogisticRegression()
best_model.fit(X_train[[best_combination]], y_train)
y_best_pred = best_model.predict(X_test[[best_combination]])
print(classification_report(y_test, y_best_pred))


Best POS combination: JJ_JJR_JJS_count, Accuracy: 0.8566775244299675
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       526
           1       0.00      0.00      0.00        88

    accuracy                           0.86       614
   macro avg       0.43      0.50      0.46       614
weighted avg       0.73      0.86      0.79       614



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [142]:
from sklearn.utils import resample
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


# Load the dataset
file_path = '/home/aaronbry/my_env/data/dev.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.isalpha()]
    return tokens

data['tokens'] = data['claim'].apply(preprocess_text)

# Separate majority and minority classes
df_majority = data[data['label'].isin(['Refuted', 'Supported'])]
df_minority = data[data['label'].isin(['Conflicting Evidence/Cherrypicking', 'Not Enough Evidence'])]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples=len(df_majority),  # to match majority class
                                 random_state=42)  # reproducible results

# Combine majority class with upsampled minority class
data_balanced = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
print(data_balanced['label'].value_counts())

# Function to extract POS tags
def extract_pos_tags(tokens):
    return nltk.pos_tag(tokens)

data_balanced['pos_tags'] = data_balanced['tokens'].apply(extract_pos_tags)

# Define combinations of POS tags
pos_combinations = [
    ['JJ', 'JJR', 'JJS'],  # Adjectives
    ['RB', 'RBR', 'RBS'],  # Adverbs
    ['MD'],  # Modals
    ['UH'],  # Interjections
    ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'MD', 'UH']  # All combined
]

# Function to count specific POS tags in tokens
def count_pos_tags(pos_tags, pos_list):
    return sum(1 for word, pos in pos_tags if pos in pos_list)

# Create feature sets for each combination
for pos_comb in pos_combinations:
    feature_name = '_'.join(pos_comb) + '_count'
    data_balanced[feature_name] = data_balanced['pos_tags'].apply(lambda pos_tags: count_pos_tags(pos_tags, pos_comb))

# Prepare the dataset for training
X = data_balanced[[('_'.join(pos_comb) + '_count') for pos_comb in pos_combinations]]
y = data_balanced['label'].apply(lambda x: 1 if x in ['Conflicting Evidence/Cherrypicking', 'Not Enough Evidence'] else 0)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate models with class weights
results = {}
for pos_comb in pos_combinations:
    feature_name = '_'.join(pos_comb) + '_count'
    model = LogisticRegression(class_weight='balanced')
    model.fit(X_train[[feature_name]], y_train)
    y_pred = model.predict(X_test[[feature_name]])
    accuracy = accuracy_score(y_test, y_pred)
    results[feature_name] = accuracy
    print(f"Features: {feature_name}, Accuracy: {accuracy}")

# Check feature distribution
print(data_balanced[['JJ_JJR_JJS_count', 'RB_RBR_RBS_count', 'MD_count', 'UH_count', 'JJ_JJR_JJS_RB_RBR_RBS_MD_UH_count']].describe())

# Combine all features
X_combined = data_balanced[['JJ_JJR_JJS_count', 'RB_RBR_RBS_count', 'MD_count', 'UH_count', 'JJ_JJR_JJS_RB_RBR_RBS_MD_UH_count']]
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train a model on combined features with class weights
combined_model = LogisticRegression(class_weight='balanced')
combined_model.fit(X_train_combined, y_train_combined)
y_combined_pred = combined_model.predict(X_test_combined)
combined_accuracy = accuracy_score(y_test_combined, y_combined_pred)
print(f"Combined Features Accuracy: {combined_accuracy}")

# Detailed classification report for the combined features
print(classification_report(y_test_combined, y_combined_pred))

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_combined)
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train a model on scaled features with class weights
scaled_model = LogisticRegression(class_weight='balanced')
scaled_model.fit(X_train_scaled, y_train_scaled)
y_scaled_pred = scaled_model.predict(X_test_scaled)
scaled_accuracy = accuracy_score(y_test_scaled, y_scaled_pred)
print(f"Scaled Features Accuracy: {scaled_accuracy}")

# Detailed classification report for the scaled features
print(classification_report(y_test_scaled, y_scaled_pred))


label
Refuted                               305
Conflicting Evidence/Cherrypicking    216
Not Enough Evidence                   211
Supported                             122
Name: count, dtype: int64
Features: JJ_JJR_JJS_count, Accuracy: 0.4678362573099415
Features: RB_RBR_RBS_count, Accuracy: 0.5380116959064327
Features: MD_count, Accuracy: 0.49707602339181284
Features: UH_count, Accuracy: 0.5146198830409356
Features: JJ_JJR_JJS_RB_RBR_RBS_MD_UH_count, Accuracy: 0.4853801169590643
       JJ_JJR_JJS_count  RB_RBR_RBS_count    MD_count  UH_count  \
count        854.000000        854.000000  854.000000     854.0   
mean           1.820843          0.615925    0.160422       0.0   
std            1.673036          1.241158    0.415161       0.0   
min            0.000000          0.000000    0.000000       0.0   
25%            1.000000          0.000000    0.000000       0.0   
50%            2.000000          0.000000    0.000000       0.0   
75%            3.000000          1.000000   

In [143]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Load the dataset
file_path = '/home/aaronbry/my_env/data/train.json'  # Change to your path
data = pd.read_json(file_path)

# Preprocess the text to make it easier to extract words
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [token for token in tokens if token.isalpha()]  # Remove non-alphabetic tokens
    return tokens

# Apply preprocessing
data['tokens'] = data['claim'].apply(preprocess_text)

# Function to extract POS tags
def extract_pos_tags(tokens):
    return nltk.pos_tag(tokens)

data['pos_tags'] = data['tokens'].apply(extract_pos_tags)

# Filter claims by specific labels
conflicting_claims = data[data['label'] == 'Refuted']

# Relevant POS tags typically associated with opinionated statements
relevant_pos_tags = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'MD', 'UH']

# Function to filter and count relevant POS tags
def filter_relevant_pos_tags(pos_tags, relevant_pos_tags):
    return [pos for word, pos in pos_tags if pos in relevant_pos_tags]

# Apply filtering to get relevant POS tags for conflicting claims
conflicting_relevant_pos_tags = [pos for tokens in conflicting_claims['pos_tags'] for pos in filter_relevant_pos_tags(tokens, relevant_pos_tags)]

# Frequency distribution of relevant POS tags
conflicting_relevant_pos_freq = Counter(conflicting_relevant_pos_tags)

# Get the most common relevant POS tags
most_common_conflicting_relevant_pos = conflicting_relevant_pos_freq.most_common(20)

# Convert to DataFrame for better readability
most_common_conflicting_relevant_pos_df = pd.DataFrame(most_common_conflicting_relevant_pos, columns=['POS Tag', 'Frequency'])

# Display the results
print("Most Common Relevant POS Tags for 'Conflicting Evidence/Cherrypicking':\n", most_common_conflicting_relevant_pos_df)


Most Common Relevant POS Tags for 'Conflicting Evidence/Cherrypicking':
   POS Tag  Frequency
0      JJ       2690
1      RB        701
2      MD        300
3     JJR         87
4     JJS         65
5     RBR         26
6     RBS         17
