# Zero-shot classification test

In [1]:
from transformers import pipeline
# Build the zero-shot classification pipeline used by the rest of the notebook.
classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli",
)

2023-08-15 14:22:26.641771: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2023-08-15 14:22:29.251027: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-08-15 14:22:29.252157: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-08-15 14:22:29.295290: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-15 14:22:29.296263: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:00:1e.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s
2023-08-15 14:22:29.296307: I tensorflow/stream_executor/platform/defau

In [2]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

## Read in data

In [4]:
# Load the raw posts from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="dep.csv")

In [5]:
# The classifier code below reads a 'text' column, so rename 'post' accordingly.
df = df.rename({"post": "text"}, axis="columns")

In [6]:
# Candidate emotion labels handed to the zero-shot classifier for every post.
text_labels = [
    "anticipation",
    "anger",
    "disgust",
    "fear",
    "joy",
    "trust",
]

In [7]:
# Display the candidate labels to confirm what will be passed to the classifier.
text_labels

['anticipation', 'anger', 'disgust', 'fear', 'joy', 'trust']

## Function to classify text and return a DataFrame

In [8]:
def classify_text(df, labels=None, clf=None, top_k=3):
    """Classify every entry of ``df['text']`` and return the results as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column holding the sequences to classify.
    labels : list of str, optional
        Candidate labels passed to the classifier. Defaults to the
        notebook-level ``text_labels``.
    clf : callable, optional
        Invoked as ``clf(sequence, labels, multi_label=False)`` and expected to
        return a mapping with ``'sequence'``, ``'labels'`` and ``'scores'``
        entries. Defaults to the notebook-level ``classifier``.
        NOTE(review): this function treats ``labels[0]`` as the winning label,
        i.e. it assumes the classifier returns labels best-first -- confirm
        against the pipeline documentation.
    top_k : int, default 3
        Number of leading scores kept in ``themeScores``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with columns ``'sequence'``, ``'label_1'``
        (first returned label) and ``'themeScores'`` (first ``top_k`` scores).
        An empty input yields an empty frame with those same columns.
    """
    # Resolve the defaults lazily so callers may inject their own labels/model.
    if labels is None:
        labels = text_labels
    if clf is None:
        clf = classifier

    output_columns = ['sequence', 'label_1', 'themeScores']
    records = []
    for sequence_to_classify in df['text']:
        result = clf(sequence_to_classify, labels, multi_label=False)
        # Keep only what is returned; slicing (rather than indexing positions
        # 1 and 2) also works when fewer than three labels are supplied.
        records.append({
            'sequence': result['sequence'],
            'label_1': result['labels'][0],
            'themeScores': result['scores'][0:top_k],
        })

    # Passing columns= keeps the schema stable even when there are no rows.
    return pd.DataFrame(records, columns=output_columns)

In [9]:
# Inspect the column names of the loaded frame before running the classifier.
df.columns

Index(['index', 'subreddit', 'date', 'text', 'covid period', 'gender'], dtype='object')

In [None]:
# Classify every post. classify_text invokes the classifier once per row of df,
# so this cell may take a long time on a large frame.
results_df = classify_text(df)

In [None]:
# Attach the predicted label and scores to the original rows, matching on the post text.
# results_df holds one row per input row, so a post text that occurs k times would
# otherwise match k rows on the right and blow up into k*k rows after the join.
# Keeping a single result per distinct text preserves the original row count.
# NOTE(review): this assumes identical texts receive identical predictions -- confirm.
df = df.merge(
    results_df.drop_duplicates(subset="sequence"),
    how="left",
    left_on="text",
    right_on="sequence",
    validate="many_to_one",  # fail loudly if the right-hand keys are not unique
)

In [None]:
# Preview the frame without the bookkeeping columns.
# The names must be passed through columns=[...]: given positionally they are read
# as (labels, axis, index) and the call raises instead of dropping three columns.
# The result is deliberately not assigned back, so df (and the CSV written from it)
# keeps these columns; a later cell pivots on 'subreddit'.
df.drop(columns=['index', 'subreddit', 'date'])

In [None]:
# Persist the scored posts; index=False omits the row index from the file.
df.to_csv(path_or_buf="Zero-sentiment.csv", index=False)

In [None]:
# Read back the scored posts that an earlier cell wrote to "Zero-sentiment.csv".
data = pd.read_csv(filepath_or_buffer="Zero-sentiment.csv")

# Wrap the loaded table in a DataFrame named df for the analysis cells that follow.
df = pd.DataFrame(data=data)

In [None]:
# Calculate, for every (emotion, COVID period) group, the average correlation
# between female and male score vectors.
correlation_data = []

emotions = df['label_1'].unique()  # Assuming 'label_1' contains the emotion labels


def _as_score_list(value):
    """Return a themeScores entry as a sequence, parsing it if it is still CSV text."""
    # After a CSV round trip the lists come back as strings such as "[0.5, 0.3, 0.1]".
    # literal_eval only parses Python literals, so it cannot execute code from the file.
    return ast.literal_eval(value) if isinstance(value, str) else value


for emotion in emotions:
    for period in df['covid period'].unique():
        subset = df[(df['label_1'] == emotion) & (df['covid period'] == period)]
        female_scores = subset.loc[subset['gender'] == 'female', 'themeScores'].map(_as_score_list).tolist()
        male_scores = subset.loc[subset['gender'] == 'male', 'themeScores'].map(_as_score_list).tolist()

        # zip stops at the shorter list, i.e. it pairs the first
        # min(len(female_scores), len(male_scores)) entries.
        # NOTE(review): the i-th female post is paired with the i-th male post purely
        # by position in the frame -- confirm that this pairing is meaningful.
        pair_correlations = [
            np.corrcoef(female, male)[0, 1]
            for female, male in zip(female_scores, male_scores)
        ]

        # A group with no female/male pairs (including a missing label) has no
        # defined average; record NaN instead of dividing by zero.
        if pair_correlations:
            average_correlation = sum(pair_correlations) / len(pair_correlations)
        else:
            average_correlation = np.nan
        correlation_data.append({'Emotion': emotion, 'Period': period, 'Correlation': average_correlation})

# columns= keeps the expected schema even if no group was produced.
correlation_df = pd.DataFrame(correlation_data, columns=['Emotion', 'Period', 'Correlation'])

In [None]:
# Convert themeScores to numerical arrays.
# The CSV stores each score list as text. ast.literal_eval parses only Python
# literals, whereas eval would execute whatever expression the file contains.
# The isinstance guard lets this cell be re-run after the column is already parsed.
df['themeScores'] = df['themeScores'].apply(
    lambda scores: np.array(ast.literal_eval(scores) if isinstance(scores, str) else scores)
)

In [None]:
# Grouped horizontal bar chart: one bar per emotion and period, bar length = correlation.
plt.figure(figsize=(12, 8))
sns.set(font_scale=1.2)
ax = sns.barplot(
    data=correlation_df,
    x='Correlation',
    y='Emotion',
    hue='Period',
    dodge=True,
)
# Label the chart through the Axes returned by seaborn.
ax.set_title('Sentiment Correlation Plot between Male and Female Subreddit Users')
ax.set_xlabel('Correlation')
ax.set_ylabel('Emotion')
ax.legend(title='Period')
ax.grid(True)
plt.show()

In [None]:
# Reload the scored posts that an earlier cell wrote to "Zero-sentiment.csv".
data = pd.read_csv("Zero-sentiment.csv")

# Create a DataFrame
df = pd.DataFrame(data)

# sns.heatmap needs a purely numeric 2-D table, but the raw frame holds text
# columns (post text, gender, labels, ...), so it cannot be plotted directly.
# Summarise it first: for each gender, the share of posts whose top label is each
# emotion (normalize='index' makes every row sum to 1).
emotion_share = pd.crosstab(df['gender'], df['label_1'], normalize='index')

# Create a heatmap using Seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(emotion_share, cmap='coolwarm', annot=True, fmt=".2f", linewidths=0.5)
plt.title('Heatmap Example')
plt.show()


### Heatmap test

In [None]:
# Heatmap of the number of labelled posts per gender (rows) and subreddit (columns).
# TODO: make one data frame for pre covid and one for post covid -- not split yet.
sns.set()
data_1 = pd.read_csv("Zero-sentiment.csv")
# pivot() cannot be used here: each (gender, subreddit) pair occurs many times and
# 'label_1' is text. pivot_table with aggfunc="count" aggregates the duplicates into
# a numeric table; fill_value=0 covers combinations that have no posts.
data_1 = data_1.pivot_table(
    index="gender",
    columns="subreddit",
    values="label_1",
    aggfunc="count",
    fill_value=0,
)
ax = sns.heatmap(data_1)
plt.title("Heatmap Depression Data")
plt.show()
