In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('final_data.csv')

In [None]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

In [None]:
# Print column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Number of non-missing values in every column, shown as a bar chart so that
# sparsely populated columns are easy to spot.
non_null_df_counts = df.count()
non_null_df_counts.plot.bar()

In [None]:
# Summary statistics of the frame (pandas describes the numeric columns by default).
df.describe()

In [None]:
# DATE PROCESSING
# Parse the raw 'date' strings (date, time with fractional seconds, UTC offset)
# and replace that column with two derived ones appended at the end of the
# frame: 'date' (calendar date) and 'time' (time of day).
# Run once per load: this cell consumes the raw 'date' column.
timestamps = pd.to_datetime(df.pop('date'), format='%Y-%m-%d %H:%M:%S.%f %z')
df['date'] = timestamps.dt.date
df['time'] = timestamps.dt.time

In [None]:
# DATE ANALYSIS
# Report the earliest and latest calendar date present in the data.
min_date, max_date = df['date'].min(), df['date'].max()
print("Minimum Date:" , min_date)
print("Maximum Date:" ,max_date)

# Number of rows per calendar date, in chronological order, as a line chart.
date_freq = df['date'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
date_freq.plot(kind='line', marker='o', ax=ax)
# NOTE(review): the date range in the title is hard-coded; check it against
# the min/max dates printed above whenever the input file changes.
ax.set_title('Frequencies between the dates: February 17 - December 17' )
ax.set_xlabel('Date')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.xticks(rotation=45)
plt.show()

In [None]:
# CUSTOMER_UNIQUE_ID ANALYSIS - how many messages each customer sent in total
# Rows per customer: one entry per distinct customer_unique_id.
customer_entry_counts = df['customer_unique_id'].value_counts()

# For every per-customer message count, the number of customers that have it.
# Only message counts up to 30 are kept for the chart.
entry_count_by_customer = (
    customer_entry_counts
    .value_counts()
    .sort_index()
    .loc[lambda counts: counts.index <= 30]
)

fig, ax = plt.subplots(figsize=(10, 6))
entry_count_by_customer.plot.bar(ax=ax)
ax.set_title('Distribution of Number of Customers for Different Message Counts')
ax.set_xlabel('Number of Messages per Customer')
ax.set_ylabel('Customer Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
# CUSTOMER_UNIQUE_ID ANALYSIS - how many times each distinct customer used the chatbot

# One row per (customer, day) pair; 'count' is the number of rows that day.
customer_date_counts = df.groupby(['customer_unique_id', 'date']).size().reset_index(name='count')

# A "usage" is one distinct day on which the customer contacted the bot, so
# count the (customer, day) rows per customer. Summing 'count' here would
# collapse back to the total number of messages per customer, which is not
# what the chart title and axis labels describe.
customer_total_entries = (
    customer_date_counts
    .groupby('customer_unique_id')
    .size()
    .reset_index(name='total_entries')
)

# For every usage count, the number of customers that have it.
customer_count_by_entries = customer_total_entries['total_entries'].value_counts().sort_index()

# Only usage counts up to 30 are kept for the chart.
customer_count_by_entries = customer_count_by_entries[customer_count_by_entries.index <= 30]

plt.figure(figsize=(10, 6))
customer_count_by_entries.plot(kind='bar')
plt.title('Customer Number X Number of Usage of Chatbot')
plt.xlabel('Number of Usage of Chatbot')
plt.ylabel('Customer Number')
plt.xticks(rotation=90)
plt.show()

In [None]:
# MESSAGE COLUMN ANALYSIS
# Occurrences of every distinct customer message, most frequent first.
message_counts = df['fixed_message'].value_counts()

# The five most frequent messages.
top_5_messages = message_counts.iloc[:5]

fig, ax = plt.subplots(figsize=(10, 6))
top_5_messages.plot.bar(ax=ax)
ax.set_title('Top 5 Most Sent Messages and Their Frequencies')
ax.set_xlabel('Message')
ax.set_ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Snapshot of the frame (row count, dtypes, non-null counts) before the
# 'xwelcomex' rows are removed, for comparison with the output after filtering.
print("DataFrame Info before deleting xwelcomex messages")
df.info()

In [None]:
# Remove every row whose 'fixed_message' is exactly 'xwelcomex'.
rows_before = len(df)
# .copy() yields an independent frame, so adding columns to it later does not
# write into a selection of the unfiltered data (SettingWithCopyWarning).
df = df[df['fixed_message'] != 'xwelcomex'].copy()
rows_after = len(df)

print("DataFrame Info after deleting xwelcomex messages")
df.info()
# Report the measured difference rather than hard-coded row counts, so the
# number stays correct when the input file changes.
print(rows_before - rows_after, " rows eliminated.")

In [None]:
import re
# Text of the first <p>...</p> element (non-greedy; used with re.DOTALL).
pattern_style = r'<p>(.*?)</p>'
# Value of a `"message": "..."` pair; the value may not contain a double quote.
pattern_message = r'"message":\s*"([^"]*)"'

def extract_message(cell):
    """Extract the answer text from one raw 'answer' cell.

    Two payload shapes are handled:
      * cells containing a ``<style`` tag: every ``<style>...</style>`` section
        is removed and the content of the first ``<p>...</p>`` is returned;
      * all other cells: the value of the first ``"message": "..."`` pair is
        returned.

    Args:
        cell: The raw cell value. Anything that is not a ``str`` (for example
            the float NaN pandas uses for missing values, or ``None``) is
            treated as "no answer".

    Returns:
        The extracted text, or the sentinel string ``"NON-APPLICABLE"`` when
        the cell is not a string or no pattern matches.
    """
    # Missing values are floats/None; `'<style' in cell` would raise TypeError.
    if not isinstance(cell, str):
        return "NON-APPLICABLE"

    if '<style' in cell:
        # Strip the style sections first so their content cannot be matched.
        cell = re.sub(r'<style.*?</style>', '', cell, flags=re.DOTALL)
        match = re.search(pattern_style, cell, re.DOTALL)
    else:
        match = re.search(pattern_message, cell)

    # The sentinel (not None) is returned when nothing matched.
    return match.group(1) if match else "NON-APPLICABLE"

# Derive the 'fixed_answer' column by running extract_message on every raw
# 'answer' value, then print the first ten rows as a spot check.
df['fixed_answer'] = df['answer'].apply(extract_message)

print(df.head(10))

In [None]:
import matplotlib.pyplot as plt
import textwrap

# FIXED ANSWER COLUMN ANALYSIS
# Occurrences of every extracted answer text, most frequent first.
fixed_answer_counts = df['fixed_answer'].value_counts()

# The ten most frequent answers.
top_10_answers = fixed_answer_counts.iloc[:10]

def wrap_labels(labels, width=15):
    """Re-flow each label so that no line exceeds `width` characters.

    Args:
        labels: Iterable of label strings.
        width: Maximum line length passed to ``textwrap.wrap``.

    Returns:
        A list with one string per input label, its wrapped lines joined by
        newline characters.
    """
    wrapped = []
    for label in labels:
        lines = textwrap.wrap(label, width=width)
        wrapped.append('\n'.join(lines))
    return wrapped

# Bar chart of the ten most frequent answers.
fig, ax = plt.subplots(figsize=(10, 6))
top_10_answers.plot.bar(ax=ax)

# Replace the tick labels with line-wrapped versions (8 characters per line),
# drawn horizontally.
wrapped_labels = wrap_labels(top_10_answers.index, width=8)
plt.xticks(range(len(top_10_answers)), wrapped_labels, rotation=0)

ax.set_title('Most given 10 answers')
ax.set_xlabel('Answer')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# CATEGORY NAME AND SUPER CATEGORY ANALYSIS
# Map every super category to the distinct category names that occur under it.
grouped = df.groupby('super_category')

# dropna(): Series.unique() keeps NaN as a value, and a float NaN in the array
# would make the ', '.join(...) below raise TypeError.
super_category_dict = {
    super_category: group['category_name'].dropna().unique()
    for super_category, group in grouped
}

for super_category, category_names in super_category_dict.items():
    # map(str, ...) keeps the join working if a name was parsed as a non-string.
    print(f"Super Category: {super_category}, Category Names: {', '.join(map(str, category_names))}")

In [None]:
# Build one conversation transcript per (customer, day).
group_keys = ['customer_unique_id', 'date']

# Number of message/answer rows in each (customer, day) conversation.
row_count_df = df.groupby(group_keys).size().reset_index(name='row_count')

# Format every row as the customer's message followed by the bot's answer.
# Missing values become empty strings and everything is cast to str, so a NaN
# or non-string cell cannot break the concatenation or the join below.
turns = (
    '\n     CUSTOMER ------>  ' + df['fixed_message'].fillna('').astype(str)
    + '\n      FIBOT  --------->     ' + df['fixed_answer'].fillna('').astype(str)
)

# Join the turns of each group with a single space. Aggregating one pre-built
# column avoids DataFrameGroupBy.apply over whole sub-frames, which is slow
# and emits a DeprecationWarning in pandas >= 2.2 because the sub-frames
# include the grouping columns.
# NOTE(review): turns are joined in the frame's existing row order; this
# assumes df is already chronological within a day - confirm, or sort by 'time'.
conversation_df = (
    df.assign(turn=turns)
    .groupby(group_keys)['turn']
    .agg(' '.join)
    .reset_index(name='conversation')
)

# Attach the per-conversation row count.
conversation_df = pd.merge(conversation_df, row_count_df, on=group_keys, how='left')

conversation_df.head()

In [None]:
import matplotlib.pyplot as plt

# Distribution of conversation lengths: for every 'row_count' value, the
# number of conversations that have it, ordered by row count.
counts = conversation_df['row_count'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(counts.index, counts.values)

ax.set_xlabel('Message Number')
ax.set_ylabel('Frequency')
ax.set_title('Frequencies of Row Counts')

# Only the 0-20 range of the x axis is shown; larger row counts fall outside the view.
ax.set_xlim(0, 20)
plt.show()

In [None]:
# Keep only conversations with at least three rows; shorter ones are discarded.
conversation_df = conversation_df.loc[conversation_df['row_count'].ge(3)]
conversation_df.info()

In [None]:
# Add an empty 'label' column (presumably filled in later during manual
# labelling) and export the conversations.
# assign() returns a new frame instead of writing into conversation_df, which
# at this point is a filtered selection; in-place column assignment on such a
# selection can raise SettingWithCopyWarning.
conversation_df = conversation_df.assign(label=None)
conversation_df.to_csv('conversations.csv', index=False)

In [None]:
# Reproducible 4 % random sample of the conversations (fixed seed) ...
df_sampled_4_percent = conversation_df.sample(frac=0.04, random_state=42)
# ... and the complementary 96 %: every row whose index label was not sampled.
is_sampled = conversation_df.index.isin(df_sampled_4_percent.index)
df_sampled_96_percent = conversation_df.loc[~is_sampled]

In [None]:
# Load dataset
# Conversations come from one file, their labels from a separate one.
df2 = pd.read_csv('data_2000_rows.csv')
labels = pd.read_csv('labels_from_1_to_1000.csv')

# Drop every column that is not needed (including the existing 'label' column,
# which is replaced below), keep the first 1000 rows and rename the
# conversation text column to 'text'.
df2 = (
    df2
    .drop(columns=['label', 'Unnamed: 0', 'customer_unique_id', 'date', 'row_count'])
    .head(1000)
    .rename(columns={'conversation': 'text'})
)

# The label column is called '9' in the labels file.
# NOTE(review): confirm that the CSV really has a header row; if '9' were the
# first label value read as a header, every label would be shifted by one row.
labels = labels.rename(columns={'9': 'label'})

# Labels are attached by index alignment: row i of df2 receives row i of
# labels; rows without a matching label index get NaN.
df2['label'] = labels['label'].copy()

# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
df2.info()
print(df2.head())

In [None]:
# Distribution of the labels over the annotated dialogs.
# Experiment note: df2.replace(-1, 0, inplace=True) was also tried before
# counting; both scenarios were evaluated.
label_counts = df2['label'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(label_counts.index, label_counts.values)

ax.set_xlabel('Labels')
ax.set_ylabel('Frequency')
ax.set_title('Frequency of Each Label in Dialogs')

# Render the figure.
plt.show()