<a href="https://colab.research.google.com/github/arifaygun/CustomerEye/blob/main/Copy_of_Trustpilot_Report_(Template).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pycountry

Collecting pycountry
  Downloading pycountry-23.12.11-py3-none-any.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycountry
Successfully installed pycountry-23.12.11


In [2]:
import os
import pandas as pd
import pycountry
from datetime import datetime
from transformers import pipeline
from google.colab import drive

In [3]:
# Mount Google Drive
drive.mount('/content/drive/')
# Make the reports folder the working directory; the CSV read and write calls later in the
# notebook use bare relative file names, so they resolve against this folder.
%cd /content/drive/My Drive/Customereye Reports/

Mounted at /content/drive/
/content/drive/My Drive/Customereye Reports


In [4]:
def preprocess_df(df, reply_text):
    """Clean a raw review export and derive the columns used by the report.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: Raw reviews DataFrame. The columns read are 'Reviews Count',
            'Reply Date', 'Experience Date', 'Review Date', 'Review Title',
            'Review Text', 'Reply Text', 'Country Code' and 'Rating'.
        reply_text: Literal text to remove from the 'Reply Date' values before
            they are parsed as dates. It is matched literally, not as a regex.

    Returns:
        A new DataFrame holding only the columns 'Year', 'Experience Date',
        'Review Date', 'Reply Date', 'Exp to Review', 'Review to Reply',
        'Rating', 'Countries', 'Reviews' and 'Replies'. Rows in which any of
        the three date columns is missing or unparseable are dropped.

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    date_columns = ['Experience Date', 'Review Date', 'Reply Date']
    report_columns = ['Year', 'Experience Date', 'Review Date', 'Reply Date', 'Exp to Review',
                      'Review to Reply', 'Rating', 'Countries', 'Reviews', 'Replies']

    # Work on a copy so the caller's frame keeps its original columns and rows
    df = df.copy()

    # Keep the first run of digits from 'Reviews Count'; missing or digit-free values become 0.
    # NOTE: this column is normalised but is not part of the returned columns.
    review_counts = df['Reviews Count'].astype(str).str.extract(r'(\d+)', expand=False)
    df['Reviews Count'] = pd.to_numeric(review_counts, errors='coerce').fillna(0).astype(int)

    # Strip the reply prefix literally: regex=False keeps characters such as '(' or '.'
    # in reply_text from being interpreted as a pattern, whatever the pandas default is
    df['Reply Date'] = df['Reply Date'].str.replace(reply_text, '', regex=False).str.strip()

    # Parse the date columns; values that cannot be parsed become NaT and are dropped below
    for column in date_columns:
        df[column] = pd.to_datetime(df[column], errors='coerce')
    df = df.dropna(subset=date_columns)

    # No NaT is left in 'Review Date' at this point, so the year is always defined
    df['Year'] = df['Review Date'].dt.year.astype(int)

    def as_text(column):
        # Missing values become '' instead of the literal string 'nan'
        return column.astype(str).where(column.notna(), '')

    # Concatenate title and text into a single 'Reviews' string
    df['Reviews'] = (as_text(df['Review Title']) + ' ' + as_text(df['Review Text'])).str.strip()

    df = df.rename(columns={'Reply Text': 'Replies', 'Country Code': 'Country'})

    def country_name(code):
        # Only string codes are looked up; anything else (e.g. NaN) has no country name
        if not isinstance(code, str):
            return None
        country = pycountry.countries.get(alpha_2=code)
        return country.name if country else None

    # Resolve each distinct code once instead of querying pycountry twice per row
    name_by_code = {code: country_name(code) for code in df['Country'].unique()}
    df['Countries'] = df['Country'].apply(name_by_code.get)

    # Elapsed time in days (86400 seconds per day), rounded to the nearest whole day
    exp_to_review = (df['Review Date'] - df['Experience Date']).dt.total_seconds() / 86400
    review_to_reply = (df['Reply Date'] - df['Review Date']).dt.total_seconds() / 86400
    df['Exp to Review'] = exp_to_review.round(0).astype(int)
    df['Review to Reply'] = review_to_reply.round(0).astype(int)

    # Selecting the report columns also discards every helper/raw column
    return df[report_columns]


In [5]:
# Function to perform sentiment analysis with error handling and text truncation
def perform_sentiment_analysis(df, pipe=None):
    """Score the 'Reviews' text of every row and return the rows that could be scored.

    The input frame is not modified.

    Args:
        df: DataFrame with a 'Reviews' column holding the text to classify.
        pipe: Optional, already-loaded text-classification pipeline. When omitted,
            the "nlptown/bert-base-multilingual-uncased-sentiment" model is loaded.
            Passing one in lets a caller load the model once and reuse it.

    Returns:
        A new DataFrame containing the rows whose analysis succeeded, with an
        integer 'Sentiment' column taken from the leading number of the
        predicted label. Rows whose analysis raised are reported via print and
        dropped.
    """
    if pipe is None:
        # Load the sentiment analysis pipeline
        pipe = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")

    # Texts are cut by character count, not token count, before being sent to the model.
    # The "- 2" presumably leaves room for the two special tokens the tokenizer adds -- TODO confirm.
    max_chars = pipe.model.config.max_position_embeddings - 2

    def analyze_sentiment(text):
        try:
            # Slicing happens inside the try so a non-string value is dropped like any
            # other failed row instead of aborting the whole run
            label = pipe(text[:max_chars])[0]['label']
            # The numeric score is the first whitespace-separated token of the label
            return int(label.split()[0])
        except Exception as e:
            print(f"Error analyzing sentiment: {e}")
            return None  # Marks the row for removal below

    sentiment = df['Reviews'].apply(analyze_sentiment)
    scored_rows = sentiment.notna()

    # Keep only successfully scored rows; copy so the caller's frame is never written to
    result = df.loc[scored_rows].copy()
    result['Sentiment'] = sentiment[scored_rows].astype(int)

    return result

In [6]:
# Function to save DataFrame to CSV with timestamp and custom filename
def save_to_csv(df, original_filename):
    """Write ``df`` to a timestamped CSV whose name is derived from ``original_filename``.

    The output path is the original path with its extension removed, followed by
    ``_sentiment_analysis_<YYYY-MM-DD_HH-MM-SS>.csv`` using the current local time.
    The index is not written. The chosen path is printed; nothing is returned.
    """
    # Split off the extension; any directory part of the path is kept as-is
    stem, _extension = os.path.splitext(original_filename)
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_csv_filename = f"{stem}_sentiment_analysis_{stamp}.csv"

    df.to_csv(output_csv_filename, index=False)
    print("Output DataFrame saved to:", output_csv_filename)


In [None]:
# Per-report settings: the export to read and the prefix to strip from 'Reply Date'.
# The file name is defined once so the read and the save below cannot drift apart.
INPUT_CSV = 'advance_america.csv'
REPLY_PREFIX = 'Reply from Advance America'

# Read the input CSV file
reviews_raw = pd.read_csv(INPUT_CSV)

# Preprocess the DataFrame
reviews_clean = preprocess_df(reviews_raw, REPLY_PREFIX)

# Perform sentiment analysis (the result keeps the name `df`, which the next cell inspects)
df = perform_sentiment_analysis(reviews_clean)

# Save the DataFrame to a timestamped CSV named after the input file
save_to_csv(df, INPUT_CSV)

In [8]:
# Inspect the resulting frame: column dtypes and non-null counts, then the first rows.
# NOTE(review): the saved output of this cell lists ten columns and no 'Sentiment' column,
# so it looks like it was produced before the sentiment step ran -- re-run top to bottom to confirm.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40630 entries, 418 to 66478
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Year             40630 non-null  int64         
 1   Experience Date  40630 non-null  datetime64[ns]
 2   Review Date      40630 non-null  datetime64[ns]
 3   Reply Date       40630 non-null  datetime64[ns]
 4   Exp to Review    40630 non-null  int64         
 5   Review to Reply  40630 non-null  int64         
 6   Rating           40630 non-null  int64         
 7   Countries        40630 non-null  object        
 8   Reviews          40630 non-null  object        
 9   Replies          40630 non-null  object        
dtypes: datetime64[ns](3), int64(4), object(3)
memory usage: 3.4+ MB
