<a href="https://colab.research.google.com/github/avisser79/Twitter-Sentiment-Analysis/blob/main/Europe_language_score_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from textblob import TextBlob
import string
import textstat

# NLTK resources used by the tokenization and part-of-speech tagging below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Read the DataFrame from the CSV file
df = pd.read_csv('Europe_sentiment_analysis_3.csv')

# Numeric engagement counts: coerce unparseable entries to NaN first and only
# then substitute the -1 sentinel, so both originally-missing and coerced
# values end up as -1. The result is assigned back to the column because
# `df[col].fillna(..., inplace=True)` acts on a temporary object under pandas
# Copy-on-Write and would leave `df` unchanged.
for count_column in ('likes_count', 'reply_count'):
    df[count_column] = pd.to_numeric(df[count_column], errors='coerce').fillna(-1)

# quote_count is kept as text; missing values become the string '-1'.
df['quote_count'] = df['quote_count'].fillna('-1').astype(str)

# retweet_count is also kept as text (not integer); missing values are filled
# with -1 before the string conversion.
df['retweet_count'] = df['retweet_count'].fillna(-1).astype(str)

# Function to calculate cohesion score
def calculate_cohesion(tweet):
    """Return the share of distinct words among all words of ``tweet``.

    The tweet is lowercased and tokenized, tokens consisting solely of
    punctuation characters are dropped, and the score is
    ``unique tokens / total tokens``.

    Args:
        tweet: Tweet text. Any non-string value (e.g. NaN from pandas) is
            treated as missing.

    Returns:
        The ratio in ``(0, 1]``, or 0 when ``tweet`` is not a string or
        contains no word tokens (empty, whitespace-only or punctuation-only).
    """
    # Blank input has no tokens; bail out before tokenizing.
    if not isinstance(tweet, str) or not tweet.strip():
        return 0

    # Test every character of the token: a plain `token in string.punctuation`
    # is a substring check and lets multi-character punctuation tokens such
    # as '...' or '``' slip through.
    tokens = [
        token for token in word_tokenize(tweet.lower())
        if not all(char in string.punctuation for char in token)
    ]

    # Punctuation-only tweets leave nothing to count; avoid dividing by zero.
    if not tokens:
        return 0

    return len(set(tokens)) / len(tokens)

# Function to calculate coherence score
def calculate_coherence(tweet, keywords):
    """Return the share of the tweet's words that are keywords.

    The tweet is lowercased and tokenized, tokens consisting solely of
    punctuation characters are dropped, and the score is
    ``tokens that are keywords / total tokens``.

    Args:
        tweet: Tweet text. Any non-string value (e.g. NaN from pandas) is
            treated as missing.
        keywords: Iterable of keyword strings. Matching is case-insensitive
            and each distinct keyword is considered once, however often it
            appears in ``keywords``.

    Returns:
        The ratio in ``[0, 1]``, or 0 when ``tweet`` is not a string or
        contains no word tokens (empty, whitespace-only or punctuation-only).
    """
    # Blank input has no tokens; bail out before tokenizing.
    if not isinstance(tweet, str) or not tweet.strip():
        return 0

    # Test every character of the token: a plain `token in string.punctuation`
    # is a substring check and lets multi-character punctuation tokens such
    # as '...' or '``' slip through.
    tokens = [
        token for token in word_tokenize(tweet.lower())
        if not all(char in string.punctuation for char in token)
    ]

    # Punctuation-only tweets leave nothing to count; avoid dividing by zero.
    if not tokens:
        return 0

    # Tokens are lowercased, so the keywords must be too. Using a set also
    # stops a keyword listed several times from being counted once per
    # listing, which could push the score above 1.
    keyword_set = {keyword.lower() for keyword in keywords}
    keyword_hits = sum(1 for token in tokens if token in keyword_set)

    return keyword_hits / len(tokens)

# Function to fetch keywords from a tweet
def fetch_keywords(tweet):
    """Return the nouns and adjectives of ``tweet`` as a list of keywords.

    Args:
        tweet: Tweet text. Any non-string value (NaN floats from pandas,
            ``None``, numbers) is treated as missing.

    Returns:
        The words whose part-of-speech tag starts with "N" or "J", in the
        order they are tagged; an empty list for non-string input.
    """
    # Guard on "is text" rather than "is float" so every kind of missing
    # value is handled the same way as in the scoring functions.
    if not isinstance(tweet, str):
        return []

    # Perform part-of-speech tagging on the tweet
    pos_tags = TextBlob(tweet).tags

    # Extract nouns and adjectives as keywords
    return [word for word, pos in pos_tags if pos.startswith(("N", "J"))]

# Keep working on the DataFrame loaded and cleaned earlier in this cell:
# reading the CSV a second time here would throw away the NaN filling and
# type conversions applied to the count columns.

# Calculate cohesion and coherence for each tweet in the dataframe
df['cohesion'] = df['cleaned_tweet'].apply(calculate_cohesion)
df['keywords'] = df['cleaned_tweet'].apply(fetch_keywords)
# Pair each tweet with its own keyword list; zipping the two columns avoids
# building a row object per tweet as df.apply(axis=1) does.
df['coherence_score'] = [
    calculate_coherence(tweet, keywords)
    for tweet, keywords in zip(df['cleaned_tweet'], df['keywords'])
]

def calculate_flesch_kincaid(tweet):
    """Return the Flesch-Kincaid grade of ``tweet`` as computed by textstat.

    Args:
        tweet: Tweet text. Any non-string value (e.g. NaN from pandas) is
            treated as missing.

    Returns:
        ``textstat.flesch_kincaid_grade(tweet)`` for strings, 0 otherwise.
    """
    # Explicit type guard instead of catching TypeError, so a genuine
    # TypeError raised inside textstat is not silently turned into a score
    # of 0.
    if not isinstance(tweet, str):
        return 0
    return textstat.flesch_kincaid_grade(tweet)

def calculate_gunning_fog(tweet):
    """Return ``textstat.gunning_fog(tweet)`` for strings, 0 for anything else."""
    is_text = isinstance(tweet, str)
    return textstat.gunning_fog(tweet) if is_text else 0

# Calculate Flesch-Kincaid Grade Level
df['Flesch_Kincaid'] = df['cleaned_tweet'].apply(calculate_flesch_kincaid)

# Calculate Gunning_Fog
df['Gunning_Fog'] = df['cleaned_tweet'].apply(calculate_gunning_fog)

# Weight of each metric's z-score in the language score. The two textstat
# metrics carry negative weights, so higher values on them lower the score.
LANGUAGE_SCORE_WEIGHTS = {
    'coherence_score': 0.4,
    'Flesch_Kincaid': -0.3,
    'cohesion': 0.2,
    'Gunning_Fog': -0.1,
}

# Calculate the z-scores for the relevant columns
columns_to_normalize = ['cohesion', 'coherence_score', 'Flesch_Kincaid', 'Gunning_Fog']
for column in columns_to_normalize:
    column_std = df[column].std()
    if pd.isna(column_std) or column_std == 0:
        # No spread (constant column or fewer than two rows): dividing would
        # fill the column, and therefore language_score, with NaN. Treat
        # every value as sitting exactly on the mean instead.
        df[f'z_{column}'] = 0.0
    else:
        df[f'z_{column}'] = (df[column] - df[column].mean()) / column_std

# Calculate the language score based on the weights
df['language_score'] = sum(
    weight * df[f'z_{metric}'] for metric, weight in LANGUAGE_SCORE_WEIGHTS.items()
)

df.to_csv('Europe_sentiment_analysis_with_language_score.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  df = pd.read_csv('Europe_sentiment_analysis_3.csv')
  df = pd.read_csv('Europe_sentiment_analysis_3.csv')


In [2]:
!pip install textstat

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3
