In [84]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots



In [16]:
from sklearn.metrics import f1_score, precision_recall_curve
import numpy as np

In [4]:
# Read the German and French headline tables from their CSV exports.
df_german = pd.read_csv("../location_analysis/data/df_german_with_mentions_sentiment.csv")
df_french = pd.read_csv("../location_analysis/data/df_french_with_mentions_sentiment.csv")


In [85]:
# Side-by-side bar charts of sentiment label counts, one panel per language.
# value_counts() orders labels by descending frequency, so the label order can
# differ between the two panels.
german_sentiments = df_german['Sentiment'].value_counts()
french_sentiments = df_french['Sentiment'].value_counts()

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('German Headlines Sentiments', 'French Headlines Sentiments'),
)

# Each entry: (trace name, label counts, subplot column).
panels = (
    ('German', german_sentiments, 1),
    ('French', french_sentiments, 2),
)
for panel_name, panel_counts, panel_col in panels:
    fig.add_trace(
        go.Bar(x=panel_counts.index, y=panel_counts.values, name=panel_name),
        row=1,
        col=panel_col,
    )

# Fixed canvas size, overall title, legend hidden.
fig.update_layout(
    height=600,
    width=1200,
    title_text="Sentiment Analysis of Headlines",
    showlegend=False,
)
fig.show()


In [87]:
# Label counts per language, ordered by label (sort_index) so both traces list
# the sentiment categories in the same order.
german_counts = df_german['Sentiment'].value_counts().sort_index()
french_counts = df_french['Sentiment'].value_counts().sort_index()

# One bar trace per language.
german_bars = go.Bar(name='German', x=german_counts.index, y=german_counts.values)
french_bars = go.Bar(name='French', x=french_counts.index, y=french_counts.values)
fig = go.Figure(data=[german_bars, french_bars])

# barmode='group' places the two languages next to each other per category.
layout_options = dict(
    barmode='group',
    title='Sentiments in German and French Headlines',
    xaxis_title='Sentiment',
    yaxis_title='Count',
    legend_title='Language',
)
fig.update_layout(**layout_options)

fig.show()


In [36]:
from sklearn.metrics import silhouette_score

# Candidate cut-offs: every distinct confidence value observed in the French data.
confidence_scores = df_french['Confidence'].values
possible_thresholds = np.unique(confidence_scores)

# Sorted copy of the scores. Both the cluster labels and the silhouette input
# are derived from this same array, so they stay aligned.
sorted_scores = np.sort(confidence_scores)

# silhouette_score expects a 2D (n_samples, n_features) array; the reshaped
# view does not depend on the threshold, so it is built once up front.
scores_reshaped = sorted_scores.reshape(-1, 1)

# One entry per candidate threshold, in the same order as possible_thresholds.
silhouette_scores = []

for threshold in possible_thresholds:
    # Label 1 for scores at or above the threshold, 0 for scores below it.
    cluster_labels = (sorted_scores >= threshold).astype(int)

    if len(np.unique(cluster_labels)) != 2:
        # The split left every point on one side: record -1 as a sentinel so
        # this threshold cannot beat any real two-cluster split.
        silhouette_scores.append(-1)
    else:
        score = silhouette_score(scores_reshaped, cluster_labels)
        silhouette_scores.append(score)

# Pick the threshold with the highest silhouette score (first one on ties).
optimal_threshold_index = np.argmax(silhouette_scores)
optimal_threshold = possible_thresholds[optimal_threshold_index]

print(f'Optimal threshold based on silhouette score: {optimal_threshold}')


Optimal threshold based on silhouette score: 0.7315875887870789


In [49]:
def apply_threshold(confidence_score, original_sentiment, threshold=optimal_threshold):
    """Downgrade a low-confidence sentiment prediction to 'neutral'.

    Parameters
    ----------
    confidence_score : confidence attached to the prediction.
    original_sentiment : label to keep when the confidence is high enough.
    threshold : cut-off; scores strictly below it are treated as neutral.
        The default is the value `optimal_threshold` had when this function
        was defined, so re-run this cell after recomputing `optimal_threshold`.

    Returns
    -------
    'neutral' if `confidence_score < threshold`, otherwise `original_sentiment`.
    """
    return 'neutral' if confidence_score < threshold else original_sentiment

# Re-label each French headline row by row: rows whose 'Confidence' is below the
# threshold become 'neutral'; all other rows keep the label from 'Sentiment'.
# Only the two columns that apply_threshold reads are passed to apply().
df_french['Sentiment Category'] = df_french[['Confidence', 'Sentiment']].apply(
    lambda row: apply_threshold(row['Confidence'], row['Sentiment']),
    axis=1,
)

**Analysis**

In [64]:
# Relative frequency of each sentiment label, per language.
prop_french = df_french['Sentiment'].value_counts(normalize=True)
prop_german = df_german['Sentiment'].value_counts(normalize=True)

# Align the two series on their label index; a label missing in one language
# shows up as NaN after alignment and is replaced with 0.
df_sentiment_proportions = pd.DataFrame({'German': prop_german, 'French': prop_french})
df_sentiment_proportions = df_sentiment_proportions.fillna(0)


In [92]:
# Grouped bar chart: one trace per language column of the proportions table,
# with the sentiment labels (the table's index) on the x axis.
fig_comparison = go.Figure(data=[
    go.Bar(
        name=language,
        x=df_sentiment_proportions.index,
        y=df_sentiment_proportions[language],
    )
    for language in ('German', 'French')
])
fig_comparison.update_layout(
    barmode='group',
    title_text='Sentiment Proportion Comparison Between German and French Headlines',
    xaxis_title='Sentiment',
    yaxis_title='Proportion',
)
fig_comparison.show()


In [69]:
# Share of headlines labelled 'negative' in each language; falls back to 0
# when the label does not occur at all.
german_label_shares = df_german['Sentiment'].value_counts(normalize=True)
french_label_shares = df_french['Sentiment'].value_counts(normalize=True)

neg_german = german_label_shares.get('negative', 0)
neg_french = french_label_shares.get('negative', 0)

print(f"Proportion of Negative Sentiment in German Headlines: {neg_german}")
print(f"Proportion of Negative Sentiment in French Headlines: {neg_french}")


Proportion of Negative Sentiment in German Headlines: 0.18232662192393737
Proportion of Negative Sentiment in French Headlines: 0.27264150943396226


In [89]:
# Two-bar chart of the negative-sentiment share per language
# (German in blue, French in red).
negative_bar = go.Bar(
    x=['German', 'French'],
    y=[neg_german, neg_french],
    marker_color=['blue', 'red'],
)
fig_neg = go.Figure([negative_bar])
fig_neg.update_layout(
    title_text='Negative Sentiment Proportion in German vs. French Headlines',
    yaxis_title='Proportion of Negative Sentiment',
)
fig_neg.show()



In [71]:
from scipy.stats import chi2_contingency
import numpy as np

# Number of headlines per language. Each total is taken from its own frame,
# so the two datasets do not need to be the same size.
total_german = len(df_german)
total_french = len(df_french)

# Count of 'negative' labels per language, ordered [German, French]
# (0 when the label never occurs).
negative_counts = np.array([
    frame['Sentiment'].value_counts().get('negative', 0)
    for frame in (df_german, df_french)
])
non_negative_counts = np.array([total_german, total_french]) - negative_counts

# 2x2 contingency table: row 0 = negative, row 1 = non-negative;
# column 0 = German, column 1 = French.
contingency_table = np.array([negative_counts, non_negative_counts])

# Chi-squared test on the 2x2 table of counts.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

print(f'Chi-squared test p-value: {p_value}')


Chi-squared test p-value: 3.1183530824954142e-06


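`Chi-squared test p-value (3.1184e-06)`: The low p-value from the chi-squared test indicates a significant difference between German and French headlines in the share of negative versus non-negative sentiment. The test compares the observed 2×2 table of counts (negative / non-negative by language) with the counts that would be expected if there were no difference between the groups, so the result supports the conclusion that the proportion of negative headlines is not the same in the two languages. Note that this test only looks at negative versus non-negative; it does not compare the full three-way sentiment distribution.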

In [93]:
from scipy.stats import ttest_ind, chi2_contingency

# Numeric encoding of the sentiment labels. Series.map leaves NaN for any
# label that is not a key of this mapping.
score_by_label = {'negative': -1, 'neutral': 0, 'positive': 1}
df_german['Sentiment Score'] = df_german['Sentiment'].map(score_by_label)
df_french['Sentiment Score'] = df_french['Sentiment'].map(score_by_label)

# Two-sample t-test on the encoded scores (German vs. French).
german_scores = df_german['Sentiment Score']
french_scores = df_french['Sentiment Score']
t_stat, p_val_ttest = ttest_ind(german_scores, french_scores)

print(f"T-test p-value: {p_val_ttest}")

# Second comparison: z-test on the proportion of negative headlines.
from statsmodels.stats.proportion import proportions_ztest

# Number of 'negative' headlines per language (0 when the label never occurs).
count_neg_german = df_german['Sentiment'].value_counts().get('negative', 0)
count_neg_french = df_french['Sentiment'].value_counts().get('negative', 0)

# total_german / total_french are not defined in this cell; they must already
# exist in the kernel (the chi-squared cell assigns them).
negative_counts_by_language = [count_neg_german, count_neg_french]
headline_totals_by_language = [total_german, total_french]
z_stat, p_val_ztest = proportions_ztest(negative_counts_by_language, headline_totals_by_language)

print(f"Z-test for proportions p-value: {p_val_ztest}")


T-test p-value: 0.8394748759359953
Z-test for proportions p-value: 2.396894884464927e-06


`T-test p-value (0.8395)`: This high p-value means the test found no statistically significant difference in the average sentiment score between German and French headlines. In other words, when sentiments are encoded as numerical scores (-1 for negative, 0 for neutral, and 1 for positive), the data give no evidence that the mean sentiment differs between the two languages; this is a failure to detect a difference, not proof that the means are equal. In practice, this test might not be entirely appropriate because sentiment scores are ordinal, not interval data, which violates the assumptions of the t-test.



`Z-test for proportions p-value (2.3969e-06)`: This very low p-value indicates a significant difference in the proportion of negative sentiments between the two sets of headlines. Since the p-value is much less than the standard alpha level of 0.05, you can reject the null hypothesis that the proportion of negative sentiments in German and French headlines is the same. Given the proportions computed above (about 27.3% negative for French versus about 18.2% for German), this suggests that French headlines have a significantly higher proportion of negative sentiments, which could be interpreted as being more sensationalist.



In [107]:
def assign_sentiment_score(sentiment):
    """Translate a sentiment label into its numeric score.

    'neutral' -> 0, 'positive' -> 1, 'negative' -> -1.
    Any other value (including None or a differently-cased label) returns None.
    """
    # Checked in this order; the first label equal to `sentiment` wins.
    label_scores = (('neutral', 0), ('positive', 1), ('negative', -1))
    for label, score in label_scores:
        if sentiment == label:
            return score
    return None  # unrecognised label

# Build (or overwrite) the French 'Sentiment Score' column by passing every
# 'Sentiment' label through assign_sentiment_score, element by element.
df_french['Sentiment Score'] = df_french['Sentiment'].map(assign_sentiment_score)


In [110]:
# Zero-row slice: displays only the column headers of the French frame.
df_french.iloc[:0]

Unnamed: 0,Title,Header,Content,Mentioned_Countries,Mentioned_Swiss_Cities,Sentiment,Confidence,Original_Sentiment,Sentiment Score


In [111]:
# Write the German frame to CSV without the row index. The target name
# ("..._sentiments_.csv") differs from the file that was loaded, so the
# input CSV is not overwritten.
csv_file_path = '../location_analysis/data/df_german_with_mentions_sentiments_.csv'
df_german.to_csv(path_or_buf=csv_file_path, index=False)

In [112]:
# Write the French frame to CSV without the row index. The target name
# ("..._sentiments_.csv") differs from the file that was loaded, so the
# input CSV is not overwritten.
csv_file_path = '../location_analysis/data/df_french_with_mentions_sentiments_.csv'
df_french.to_csv(path_or_buf=csv_file_path, index=False)