In [1]:
import os
import sys
import pandas as pd

In [2]:
data = os.path.abspath(os.path.join('.','data', 'data.csv'))

In [3]:
df= pd.read_csv(data)

In [8]:
df.columns

Index(['Channel', 'AuthorID', 'Author', 'Date', 'Content', 'Attachments',
       'Reactions'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,Channel,AuthorID,Author,Date,Content,Attachments,Reactions
0,Ocean Protocol - CORE-TECH - 🌊┃builders [10887...,369700858742571008,mantisclone,03/29/2023 1:02 AM,gm 👋 :Water_Wave: :Oceancopy:,,
1,Ocean Protocol - CORE-TECH - 🌊┃builders [10887...,843831770062913568,white_rider_,03/29/2023 4:37 PM,Good morning to all the @🧑‍💻Builder 🌊,,
2,Ocean Protocol - CORE-TECH - 🌊┃builders [10887...,563434444321587202,Maki#4920,03/29/2023 4:37 PM,**@White Rider just advanced to level 4!**,https://cdn.discordapp.com/attachments/1088751...,powerup (1)
3,Ocean Protocol - CORE-TECH - 🌊┃builders [10887...,194817764236460034,robinlehmann,03/29/2023 4:40 PM,https://tenor.com/view/hello-there-baby-yoda-m...,,babyyodasoup (1)
4,Ocean Protocol - CORE-TECH - 🌊┃builders [10887...,387401160656683034,birususama,03/29/2023 4:40 PM,Good morning everyone 😀,,🌅 (1)


In [9]:
# Filter out bot users
non_bot_df = df[df['Author'] != 'Bot']

# Calculate metrics for each user
user_activity = non_bot_df.groupby('AuthorID').agg({
    'Content': 'count',  # Number of messages
    'Attachments': 'sum',  # Total attachments sent
    'Reactions': 'sum'  # Total reactions received
})


In [10]:
# Count words and characters in messages
user_activity['Words'] = non_bot_df.groupby('AuthorID')['Content'].apply(lambda x: x.str.split().str.len().sum())
user_activity['Characters'] = non_bot_df.groupby('AuthorID')['Content'].apply(lambda x: x.str.len().sum())


In [11]:
# Rank users by activity metrics
user_activity = user_activity.sort_values(by='Content', ascending=False)

# Analyze time of day/week for peak user activity
df['Date'] = pd.to_datetime(df['Date'])
df['Hour'] = df['Date'].dt.hour
df['Day_of_week'] = df['Date'].dt.dayofweek

In [12]:
# Aggregate user activity by hour and day of the week
activity_by_hour = df.groupby(['AuthorID', 'Hour']).size().unstack(fill_value=0)
activity_by_day = df.groupby(['AuthorID', 'Day_of_week']).size().unstack(fill_value=0)


In [13]:
# Classify users into categories based on activity levels
def classify_users(row):
    if row['Content'] >= high_threshold:
        return 'Highly Active'
    elif row['Content'] >= moderate_threshold:
        return 'Moderately Active'
    else:
        return 'Infrequent'


In [14]:
# Define activity thresholds for classification
high_threshold = user_activity['Content'].quantile(0.9)
moderate_threshold = user_activity['Content'].quantile(0.5)

# Apply classification function to create a new column
user_activity['Activity_Category'] = user_activity.apply(classify_users, axis=1)


In [15]:
# Display user ranking and activity classification
print("User Ranking and Activity Classification:")
print(user_activity)

User Ranking and Activity Classification:
                     Content  \
AuthorID                       
159985870458322944     17878   
456226577798135808      5825   
717363377269244015      1199   
739132787499597824      1116   
344879785173843970      1048   
...                      ...   
1009089128266207243        0   
1009088927715561514        0   
1009088718260404255        0   
1009085096881573899        0   
1009089345375981740        0   

                                                           Attachments  \
AuthorID                                                                 
159985870458322944   https://cdn.discordapp.com/attachments/7692097...   
456226577798135808   https://cdn.discordapp.com/attachments/6129533...   
717363377269244015   https://cdn.discordapp.com/attachments/7692097...   
739132787499597824   https://cdn.discordapp.com/attachments/9939544...   
344879785173843970   https://cdn.discordapp.com/attachments/7206317...   
...                    

In [38]:
user_activity

Unnamed: 0,Author,Content,Attachments,Reactions
0,! GD┘áShaibi#3385,Joined the server.,1,
1,! Reko .#6689,Joined the server.,1,
2,! SΓêåM#2947,Joined the server.,1,
3,! ajjaxx.xp#8310,Joined the server.,1,
4,! Arii#5937,Joined the server.,1,
...,...,...,...,...
9020,🚨❓ FAQ | queries#8727,Joined the server.,1,
9021,🚨❓ FAQ | queries#9354,Joined the server.,1,
9022,🚨❓FAQ | queries#3602,Joined the server.,1,
9023,🛎Support|FAQ#9746,Joined the server.,1,


In [23]:
user_activity['Message_Count'] = user_activity['Content'].apply(lambda x: len(x.split()))

AttributeError: 'int' object has no attribute 'split'

In [31]:
user_activity.head(5)

Unnamed: 0_level_0,Content,Attachments,Reactions,Words,Characters,Activity_Category
AuthorID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
159985870458322944,17878,https://cdn.discordapp.com/attachments/7692097...,"❤️ (1)🤙 (1)🏆 (3)🥶 (2),👀 (2),🧑‍🌾 (2)🤝 (2)🔥 (5)🔥...",1220023.0,9780681.0,Highly Active
456226577798135808,5825,https://cdn.discordapp.com/attachments/6129533...,➕ (2)❤️ (2)mildpanicintensifies (3)🔥 (1)💪 (2)👍...,23443.0,138702.0,Highly Active
717363377269244015,1199,https://cdn.discordapp.com/attachments/7692097...,"👍 (1)👍 (1)👍 (2)🥶 (1),😆 (1),😂 (1)☀️ (1)🔥 (2)🚀 (...",37092.0,219677.0,Highly Active
739132787499597824,1116,https://cdn.discordapp.com/attachments/9939544...,"👍 (1)👍 (1)✅ (2)👍 (1)👍 (1)🙌 (2),👍 (1)🚀 (2),🥰 (1...",24590.0,162308.0,Highly Active
344879785173843970,1048,https://cdn.discordapp.com/attachments/7206317...,"🥳 (1)🥳 (2)😎 (2)🚀 (3)👏 (2)🔥 (1)🔥 (2),👏 (2)🔥 (1)...",36385.0,271745.0,Highly Active


In [27]:
# Assuming you have calculated user_activity DataFrame

# Select top five users based on activity metrics
top_users = user_activity.head(5)
bottom_users = user_activity.tail(5)

# Display the top five users
print(top_users)


                    Content  \
AuthorID                      
159985870458322944    17878   
456226577798135808     5825   
717363377269244015     1199   
739132787499597824     1116   
344879785173843970     1048   

                                                          Attachments  \
AuthorID                                                                
159985870458322944  https://cdn.discordapp.com/attachments/7692097...   
456226577798135808  https://cdn.discordapp.com/attachments/6129533...   
717363377269244015  https://cdn.discordapp.com/attachments/7692097...   
739132787499597824  https://cdn.discordapp.com/attachments/9939544...   
344879785173843970  https://cdn.discordapp.com/attachments/7206317...   

                                                            Reactions  \
AuthorID                                                                
159985870458322944  ❤️ (1)🤙 (1)🏆 (3)🥶 (2),👀 (2),🧑‍🌾 (2)🤝 (2)🔥 (5)🔥...   
456226577798135808  ➕ (2)❤️ (2)mildpanicintensifies

In [28]:
print(bottom_users)

                     Content Attachments Reactions  Words  Characters  \
AuthorID                                                                
1009089128266207243        0           0         0    0.0         0.0   
1009088927715561514        0           0         0    0.0         0.0   
1009088718260404255        0           0         0    0.0         0.0   
1009085096881573899        0           0         0    0.0         0.0   
1009089345375981740        0           0         0    0.0         0.0   

                    Activity_Category  
AuthorID                               
1009089128266207243        Infrequent  
1009088927715561514        Infrequent  
1009088718260404255        Infrequent  
1009085096881573899        Infrequent  
1009089345375981740        Infrequent  


In [42]:
# Time Analysis
#user_activity.columns = ['Author', 'Message_Count', 'Words', 'Attachments', 'Reactions_Received']

df2['Date'] = pd.to_datetime(df2['Date'])  # Convert 'Date' column to datetime
df2['Hour'] = df2['Date'].dt.hour           # Extract hour of the day
df2['Day_of_Week'] = df2['Date'].dt.day_name()  # Extract day of the week


In [43]:
# Group by user and hour/day_of_week and count messages
user_activity_by_hour = df2.groupby(['Author', 'Hour']).size().reset_index(name='Hourly_Activity_Count')
user_activity_by_day = df2.groupby(['Author', 'Day_of_Week']).size().reset_index(name='Daily_Activity_Count')

# User Categorization (example criteria)
# Define thresholds for each category
high_activity_threshold = 100  # Define your threshold based on your dataset
moderate_activity_threshold = 50

In [44]:
user_activity['Message_Count'].iloc[0]

3

In [45]:
# Categorize users based on activity levels
user_activity['Category'] = 'Casual User'  # Default category
user_activity.loc[user_activity['Message_Count'] > high_activity_threshold, 'Category'] = 'Active Contributor'
user_activity.loc[(user_activity['Message_Count'] > moderate_activity_threshold) & (user_activity['Message_Count'] <= high_activity_threshold), 'Category'] = 'Moderate Contributor'

In [46]:
user_activity.head()


Unnamed: 0,Author,Content,Attachments,Reactions,Message_Count,Category
0,! GD┘áShaibi#3385,Joined the server.,1,,3,Casual User
1,! Reko .#6689,Joined the server.,1,,3,Casual User
2,! SΓêåM#2947,Joined the server.,1,,3,Casual User
3,! ajjaxx.xp#8310,Joined the server.,1,,3,Casual User
4,! Arii#5937,Joined the server.,1,,3,Casual User


In [52]:
user_activity_by_hour.sort_values(by='Hourly_Activity_Count',ascending=False).head()


Unnamed: 0,Author,Hour,Hourly_Activity_Count
1598,GitHub,20,1761
1599,GitHub,21,1699
1597,GitHub,19,1607
2104,MEE6#4876,21,1555
1596,GitHub,18,1554


In [50]:
user_activity_by_day.sort_values(by='Daily_Activity_Count',ascending=False).head()

Unnamed: 0,Author,Day_of_Week,Daily_Activity_Count
1838,MEE6#4876,Tuesday,4536
1378,GitHub,Wednesday,3222
1376,GitHub,Thursday,3176
1833,MEE6#4876,Friday,2981
1834,MEE6#4876,Monday,2963
