In [1]:
import os
import re
import warnings
from functools import lru_cache

import pandas as pd
from langchain.tools import BaseTool
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain_groq import ChatGroq
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the raw sentiment dataset; the path is relative to the notebook's working directory.
# The preview output below shows two leftover index columns ('Unnamed: 0.1', 'Unnamed: 0') carried in the CSV.
sentiment_df = pd.read_csv('sentiment.csv')

In [3]:
# Preview the first rows of the raw frame before any cleaning is applied.
sentiment_df.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,Just finished an amazing workout! 💪 ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19


In [4]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stopword list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance that every call reuses."""
    return WordNetLemmatizer()


def clean_text(text: str) -> str:
    """
    Cleans the input text by performing several steps:
    - Removing every character that is not an ASCII letter or whitespace
    - Converting text to lowercase
    - Removing English stopwords
    - Lemmatizing the remaining words (WordNetLemmatizer defaults)

    Missing values (``None`` / NaN) yield an empty string and any other
    non-string value is converted with ``str()`` first, so the function is
    safe to ``apply`` over a column that holds mixed objects.

    Args:
        text (str): Input text to clean.

    Returns:
        str: Cleaned text with tokens joined by single spaces; ``''`` when
        nothing survives the cleaning.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove special characters and digits, then lowercase and tokenize.
    # NOTE(review): apostrophes are stripped here, so a contraction such as
    # "don't" becomes "dont" before the stopword check — confirm that is intended.
    letters_only = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = letters_only.lower().split()
    if not tokens:
        # Nothing left to filter; skip touching the NLTK resources at all.
        return ''

    # Stopword set and lemmatizer are cached so that applying this function
    # to every cell of a DataFrame does not rebuild them on each call.
    stop_words = _english_stop_words()
    lemmatize = _shared_lemmatizer().lemmatize

    return ' '.join(lemmatize(word) for word in tokens if word not in stop_words)

In [5]:
def _is_text_column(series: pd.Series) -> bool:
    """True for object-dtype columns and for pandas' dedicated string dtype."""
    dtype = series.dtype
    return dtype == 'object' or isinstance(dtype, pd.StringDtype)


def _is_mean_fillable(series: pd.Series) -> bool:
    """True for numeric, datetime and timedelta columns (the dtypes that get mean-filled)."""
    types = pd.api.types
    return (
        types.is_numeric_dtype(series)
        or types.is_datetime64_any_dtype(series)
        or types.is_timedelta64_dtype(series)
    )


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the input DataFrame by performing several common data cleaning steps:
    - Removing duplicate rows
    - Handling missing values ('' for text columns, column mean for
      numeric/datetime/timedelta columns; other dtypes are left as they are)
    - Converting text columns to datetime when every value parses
    - Normalizing column names (lowercase, spaces replaced by underscores)
    - Cleaning the remaining text columns with ``clean_text``
    - Removing rows outside the 1.5 * IQR fences of each float64/int64 column

    The input frame is never modified; all work happens on a copy. The
    returned frame keeps the original index labels of the surviving rows.

    Args:
        df (pd.DataFrame): Input DataFrame to clean.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """

    # Step 1: Remove any duplicate rows. The explicit copy guarantees the
    # column assignments below write to our own frame (no SettingWithCopyWarning).
    df = df.drop_duplicates().copy()

    # Step 2: Handle missing values
    for column in df.columns:
        series = df[column]
        if _is_text_column(series):
            # Fill missing text data with an empty string
            df[column] = series.fillna('')
        elif _is_mean_fillable(series):
            # Fill missing numeric data with the mean
            df[column] = series.fillna(series.mean())
        # Any other dtype (e.g. categorical) has no mean; leave it untouched.

    # Step 3: Correct data types
    for column in df.columns:
        if not _is_text_column(df[column]):
            continue
        try:
            # This is a best-effort probe: silence the UserWarning pandas emits
            # while guessing a format for columns that turn out not to be dates.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', UserWarning)
                df[column] = pd.to_datetime(df[column])
        except (ValueError, TypeError):
            # Not a datetime column; keep the original dtype.
            continue

    # Step 4: Normalize column names. astype(str) lets non-string labels
    # (e.g. the default integer labels) go through the .str accessor.
    df.columns = df.columns.astype(str).str.lower().str.replace(' ', '_')

    # Step 5: Clean text data
    for column in df.columns:
        if _is_text_column(df[column]):
            df[column] = df[column].apply(clean_text)

    # Step 6: (Optional) Remove outliers. Columns are filtered one after the
    # other, so later quantiles are computed on the rows that are still left.
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        within_fences = (df[col] >= q1 - 1.5 * iqr) & (df[col] <= q3 + 1.5 * iqr)
        df = df.loc[within_fences]

    return df

In [6]:
# Run the cleaning pipeline on the raw sentiment frame and keep the result under its own name.
cleaned_data = clean_data(sentiment_df)

# Bare last expression so the notebook renders the cleaned frame.
# clean_data does not reset the index, so gaps in the labels (e.g. label 1 in the output below)
# mark rows that were removed during cleaning.
cleaned_data

  df[column] = pd.to_datetime(df[column])
  df[column] = pd.to_datetime(df[column])
  df[column] = pd.to_datetime(df[column])
  df[column] = pd.to_datetime(df[column])
  df[column] = pd.to_datetime(df[column])
  df[column] = pd.to_datetime(df[column])


Unnamed: 0,unnamed:_0,text,sentiment,timestamp,user,platform,hashtags,retweets,likes,country,year,month,day,hour
0,0,enjoying beautiful day park,positive,2023-01-15 12:30:00,user,twitter,nature park,15.0,30.0,usa,2023,1,15,12
2,2,finished amazing workout,positive,2023-01-15 15:45:00,fitnessfan,instagram,fitness workout,20.0,40.0,usa,2023,1,15,15
3,3,excited upcoming weekend getaway,positive,2023-01-15 18:20:00,adventurex,facebook,travel adventure,8.0,15.0,uk,2023,1,15,18
4,4,trying new recipe dinner tonight,neutral,2023-01-15 19:55:00,chefcook,instagram,cooking food,12.0,25.0,australia,2023,1,15,19
5,5,feeling grateful little thing life,positive,2023-01-16 09:10:00,gratitudenow,twitter,gratitude positivevibes,25.0,50.0,india,2023,1,16,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
727,727,collaborating science project received recogni...,happy,2017-08-18 18:20:00,scienceprojectsuccesshighschool,facebook,sciencefairwinner highschoolscience,20.0,39.0,uk,2017,8,18,18
728,728,attending surprise birthday party organized fr...,happy,2018-06-22 14:15:00,birthdaypartyjoyhighschool,instagram,surprisecelebration highschoolfriendship,25.0,48.0,usa,2018,6,22,14
729,729,successfully fundraising school charity initia...,happy,2019-04-05 17:30:00,charityfundraisingtriumphhighschool,twitter,communitygiving highschoolphilanthropy,22.0,42.0,canada,2019,4,5,17
730,730,participating multicultural festival celebrati...,happy,2020-02-29 20:45:00,multiculturalfestivaljoyhighschool,facebook,culturalcelebration highschoolunity,21.0,43.0,uk,2020,2,29,20


In [None]:
class DataFrameTool(BaseTool):
    """LangChain tool that runs ``clean_data`` on a DataFrame.

    NOTE(review): ``_run`` expects an in-memory ``pd.DataFrame``. Tool
    arguments produced by an LLM are presumably plain text/JSON, so confirm
    that the agent can actually hand a DataFrame to this tool.
    """

    # Explicit annotations so pydantic treats these as overrides of the
    # BaseTool fields; pydantic v2 rejects un-annotated field overrides.
    name: str = "DataFrameTool"
    description: str = "A tool that takes a Pandas DataFrame, performs operations and returns a new DataFrame."

    def _run(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean ``df`` with ``clean_data`` and return the cleaned frame."""
        return clean_data(df)

    def _call(self, df: pd.DataFrame) -> pd.DataFrame:
        """Thin alias that forwards to ``_run``."""
        return self._run(df)

In [None]:
# The Groq credential is read from the GROQ_API_KEY environment variable so that no
# secret is stored in the notebook; a missing variable fails fast with a KeyError.
# SECURITY: a literal key used to be committed on this line — treat it as leaked and revoke it.
llm_model = ChatGroq(temperature=0, groq_api_key=os.environ['GROQ_API_KEY'], model_name="mixtral-8x7b-32768")

# Agent that additionally receives the custom cleaning tool.
agent_with_tool = create_pandas_dataframe_agent(llm=llm_model, df=sentiment_df,verbose=True, agent_type='tool-calling', extra_tools=[DataFrameTool()])

# Baseline agent over the same frame, created without extra tools.
agent_without_tool = create_pandas_dataframe_agent(llm=llm_model, df=sentiment_df,verbose=True)

In [None]:
# Baseline run: this agent was created without `extra_tools`.
agent_without_tool.invoke('Take the dataset and preprocess it using the tools available to you')

In [None]:
# Same prompt sent to the agent that was given DataFrameTool, for comparison with the baseline run.
agent_with_tool.invoke('Take the dataset and preprocess it using the tools available to you')