In [1]:
import pandas as pd
import re
import polars as pl
import jupyter_black

jupyter_black.load()


In [2]:
# Column names shared by the two profile files (they have no header row).
_PROFILE_COLUMNS = [
    "UserID",
    "CreatedAt",
    "CollectedAt",
    "NumberOfFollowings",
    "NumberOfFollowers",
    "NumberOfTweets",
    "LengthOfScreenName",
    "LengthOfDescriptionInUserProfile",
]


def _read_profile(path):
    """Read one profile file and add the derived per-account columns."""
    profile = pd.read_table(path, header=None, names=_PROFILE_COLUMNS)

    # No guard against zero followers: pandas yields inf (or NaN for 0/0) there.
    profile["FollowingFollowersRatio"] = (
        profile["NumberOfFollowings"] / profile["NumberOfFollowers"]
    )

    # Convert date columns to datetime
    profile["CreatedAt"] = pd.to_datetime(profile["CreatedAt"])
    profile["CollectedAt"] = pd.to_datetime(profile["CollectedAt"])

    # Account lifetime in whole days between creation and collection
    profile["AccountLifetime"] = (profile["CollectedAt"] - profile["CreatedAt"]).dt.days

    # Average tweets per day over the account lifetime (not guarded against a
    # lifetime of 0 days either)
    profile["AverageTweetsPerDay"] = (
        profile["NumberOfTweets"] / profile["AccountLifetime"]
    )
    return profile


def _read_followings(path):
    """Read one followings file: a UserID and its raw series-of-followings field."""
    return pd.read_table(
        path, header=None, names=["UserID", "SeriesOfNumberOfFollowings"]
    )


def _read_tweets(path):
    """Read one tweets file without any further processing."""
    return pd.read_table(
        path, header=None, names=["UserID", "TweetID", "Tweet", "CreatedAt"]
    )


def read_and_process_files():
    """
    Load the six dataset files from the "Datasets" directory.

    The two profile frames get four derived columns (FollowingFollowersRatio,
    AccountLifetime, AverageTweetsPerDay and datetime-converted CreatedAt /
    CollectedAt); the followings and tweets frames are returned as read.

    Returns:
        dict: Dataframes keyed by "polluters_profile", "polluters_followings",
            "polluters_tweets", "legitimate_profile", "legitimate_followings"
            and "legitimate_tweets".
    """
    return {
        "polluters_profile": _read_profile("Datasets/content_polluters.txt"),
        "polluters_followings": _read_followings(
            "Datasets/content_polluters_followings.txt"
        ),
        "polluters_tweets": _read_tweets("Datasets/content_polluters_tweets.txt"),
        "legitimate_profile": _read_profile("Datasets/legitimate_users.txt"),
        "legitimate_followings": _read_followings(
            "Datasets/legitimate_users_followings.txt"
        ),
        "legitimate_tweets": _read_tweets("Datasets/legitimate_users_tweets.txt"),
    }


# Load every dataset once; the profile frames come back with their derived columns.
data = read_and_process_files()

# Split the dictionary into the module-level frames used by the rest of the notebook.
# ----- content polluters -----
df_polluters, df_polluters_tweets, df_polluters_followings = (
    data[key]
    for key in ("polluters_profile", "polluters_tweets", "polluters_followings")
)

# ----- legitimate users -----
df_legitimate, df_legitimate_tweets, df_legitimate_followings = (
    data[key]
    for key in ("legitimate_profile", "legitimate_tweets", "legitimate_followings")
)


# ====================================== Creating the remaining features =========================
# Count the URLs contained in a tweet
def count_urls(text):
    """
    Return how many http(s) URLs occur in ``text``.

    ``text`` is converted with ``str()`` before searching, so non-string
    values (for example a missing tweet) are searched as their string form.
    """
    url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    return sum(1 for _ in re.finditer(url_pattern, str(text)))


# Per-tweet URL features, computed the same way for the polluter and legitimate tables.
for tweets_table in (df_polluters_tweets, df_legitimate_tweets):
    # Number of URLs found in each tweet
    tweets_table["URL_Count"] = tweets_table["Tweet"].apply(count_urls)
    # The divisor is 1 for a present tweet and 0 for a missing one, so this equals
    # URL_Count for present tweets and is NaN (0 / 0) for missing ones.
    # NOTE(review): confirm this is the intended definition of a "proportion".
    tweets_table["Proportion_URL"] = tweets_table["URL_Count"].div(
        tweets_table["Tweet"].notna().astype(int)
    )


# Time difference between tweets
# Compute the mean and maximum time between two consecutive tweets


def calcul_time_diff(df):
    """
    Compute, per user, the mean and maximum time between consecutive tweets.

    Parameters:
        df (pd.DataFrame): Tweets with a 'UserID' column and a 'CreatedAt'
            column that pd.to_datetime can parse. The frame is not modified.

    Returns:
        pd.DataFrame: One row per UserID with the columns 'UserID',
            'Mean_Time_Between_Tweets' and 'Max_Time_Between_Tweets', both
            in seconds. A user with a single tweet has no gap, so both
            values are NaN.
    """
    # Work on a two-column copy so the caller's 'CreatedAt' column keeps its dtype
    tweets = df[["UserID", "CreatedAt"]].copy()
    tweets["CreatedAt"] = pd.to_datetime(tweets["CreatedAt"])

    # Order each user's tweets chronologically before differencing
    tweets = tweets.sort_values(by=["UserID", "CreatedAt"])

    # Gap to the previous tweet of the same user, in seconds (NaN for the first tweet)
    tweets["Time_Diff_Seconds"] = (
        tweets.groupby("UserID")["CreatedAt"].diff().dt.total_seconds()
    )

    # Mean and maximum gap per user
    time_stats = (
        tweets.groupby("UserID")["Time_Diff_Seconds"].agg(["mean", "max"]).reset_index()
    )
    time_stats.columns = [
        "UserID",
        "Mean_Time_Between_Tweets",
        "Max_Time_Between_Tweets",
    ]
    return time_stats


def calculate_at_proportion(df):
    """
    Add '@'-mention features computed from the 'Tweet' column of a dataframe.

    Missing tweets are treated as empty strings, so they get a length of 0
    and a proportion of 0 instead of being measured as the text "nan".

    Parameters:
        df (pd.DataFrame): The input dataframe containing a 'Tweet' column.
            It is modified in place.

    Returns:
        pd.DataFrame: The same dataframe, with 'Tweet' converted to strings
                     and these columns added:
                     - 'count_AT': Number of '@' characters in each tweet.
                     - 'Tweet_Length': Length of each tweet in characters.
                     - 'Proportion_AT': count_AT / Tweet_Length, or 0 for an
                       empty tweet.
    """
    # Convert to strings, but map missing values to "" rather than "nan"/"None"
    raw_tweets = df["Tweet"]
    df["Tweet"] = raw_tweets.astype(str).where(raw_tweets.notna(), "")

    # Vectorised string operations instead of a Python-level apply per row
    df["count_AT"] = df["Tweet"].str.count("@")
    df["Tweet_Length"] = df["Tweet"].str.len()

    # Mask zero lengths before dividing so empty tweets end up as 0, not NaN/inf
    nonzero_length = df["Tweet_Length"].where(df["Tweet_Length"] != 0)
    df["Proportion_AT"] = (df["count_AT"] / nonzero_length).fillna(0.0)

    return df


# ============================== '@' proportion features ==============================
df_legitimate_tweets = calculate_at_proportion(df_legitimate_tweets)
df_polluters_tweets = calculate_at_proportion(df_polluters_tweets)

# Mean / max gap between consecutive tweets, attached to each profile by UserID
time_diff_polluters = calcul_time_diff(df_polluters_tweets)
df_polluters = df_polluters.merge(time_diff_polluters, on="UserID", how="left")

time_diff_legitimate = calcul_time_diff(df_legitimate_tweets)
df_legitimate = df_legitimate.merge(time_diff_legitimate, on="UserID", how="left")

# Keep one tweet row per user (the first row met for each UserID); its per-tweet
# features stand in for that user's tweet features.
df_no_duplicates_polluters_tweets = df_polluters_tweets.drop_duplicates(
    subset="UserID", keep="first"
)
df_no_duplicates_legitimate_tweets = df_legitimate_tweets.drop_duplicates(
    subset="UserID", keep="first"
)

TWEET_FEATURE_COLUMNS = [
    "URL_Count",
    "Proportion_URL",
    "count_AT",
    "Tweet_Length",
    "Proportion_AT",
]
PROFILE_ID_COLUMNS = ["UserID", "CreatedAt", "CollectedAt"]

# Join the tweet features to the profiles on UserID. A column-wise concat on reset
# indexes would pair rows purely by position, so any profile without tweets would
# shift every following row onto the wrong user.
df_polluters_final = df_polluters.merge(
    df_no_duplicates_polluters_tweets[["UserID"] + TWEET_FEATURE_COLUMNS],
    on="UserID",
    how="left",
).drop(columns=PROFILE_ID_COLUMNS)

df_legitimate_final = df_legitimate.merge(
    df_no_duplicates_legitimate_tweets[["UserID"] + TWEET_FEATURE_COLUMNS],
    on="UserID",
    how="left",
).drop(columns=PROFILE_ID_COLUMNS)

# The right-hand side has one row per UserID, so a left merge must keep the row count
assert len(df_polluters_final) == len(df_polluters)
assert len(df_legitimate_final) == len(df_legitimate)

# Keep the reduced frames under the names the later inspection cells use
df_polluters_tweets = df_no_duplicates_polluters_tweets[
    TWEET_FEATURE_COLUMNS
].reset_index(drop=True)
df_legitimate_tweets = df_no_duplicates_legitimate_tweets[
    TWEET_FEATURE_COLUMNS
].reset_index(drop=True)

df_polluters = df_polluters.drop(columns=PROFILE_ID_COLUMNS).reset_index(drop=True)
df_legitimate = df_legitimate.drop(columns=PROFILE_ID_COLUMNS).reset_index(drop=True)

# Add a 'Class' column to indicate polluters (1) and legitimate users (0)
df_polluters_final["Class"] = 1  # Polluters
df_legitimate_final["Class"] = 0  # Legitimate users

# Stack both classes into a single dataset and save it as CSV
df_final = pd.concat(
    [df_polluters_final, df_legitimate_final], axis=0, ignore_index=True
)
df_final.to_csv("Combined_datasets.csv", index=False)

In [3]:
df_polluters_final.head()

Unnamed: 0,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile,FollowingFollowersRatio,AccountLifetime,AverageTweetsPerDay,Mean_Time_Between_Tweets,Max_Time_Between_Tweets,URL_Count,Proportion_URL,count_AT,Tweet_Length,Proportion_AT,Class
0,3269,3071,861,8,132,1.064474,1217,0.707477,29533.18593,264985.0,1.0,1.0,0.0,134.0,0.0,1
1,1949,793,226,9,134,2.457755,1329,0.170053,263780.025381,7649040.0,1.0,1.0,0.0,38.0,0.0,1
2,1119,9644,38674,12,158,0.116031,1272,30.404088,1705.964286,43858.0,0.0,0.0,1.0,93.0,0.010753,1
3,2174,6029,12718,11,121,0.36059,1105,11.509502,7551.668342,74830.0,0.0,0.0,0.0,102.0,0.0,1
4,7731,7029,873,6,70,1.099872,1105,0.790045,13131.241206,354362.0,0.0,0.0,0.0,42.0,0.0,1


In [4]:
df_legitimate_final.head()

Unnamed: 0,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile,FollowingFollowersRatio,AccountLifetime,AverageTweetsPerDay,Mean_Time_Between_Tweets,Max_Time_Between_Tweets,URL_Count,Proportion_URL,count_AT,Tweet_Length,Proportion_AT,Class
0,510,350,3265,10,34,1.457143,1226,2.663132,10839.075377,91280.0,0.0,0.0,0.0,60.0,0.0,0
1,304,443,4405,7,156,0.68623,1219,3.613618,8288.050251,73558.0,0.0,0.0,1.0,136.0,0.007353,0
2,45,73,725,6,37,0.616438,1219,0.59475,39585.422111,364917.0,0.0,0.0,0.0,111.0,0.0,0
3,211,230,211,7,0,0.917391,1226,0.172104,400437.045226,7693877.0,0.0,0.0,1.0,39.0,0.025641,0
4,7346,7244,11438,8,97,1.014081,1213,9.429514,3655.570707,86675.0,1.0,1.0,0.0,48.0,0.0,0


In [5]:
# # Add a 'Class' column to indicate polluters (1) and legitimate users (0)
# df_polluters_final["Class"] = 1  # Polluters
# df_legitimate_final["Class"] = 0  # Legitimate users

In [9]:
# df_final = pd.concat(
#     [df_polluters_final, df_legitimate_final], axis=0, ignore_index=True
# )
# df_final.head()

In [10]:
df_final.shape

(41499, 16)

In [8]:
idkjlkaj  # NOTE(review): undefined name; this cell only raises NameError (see its output). Presumably a deliberate barrier to stop "Run All" - confirm, then remove.

NameError: name 'idkjlkaj' is not defined

In [None]:
# NOTE(review): this cell repeats the last steps of cell In [2]. If it runs after that
# cell, the profile frames have presumably already lost "UserID", "CreatedAt" and
# "CollectedAt", so the drop() calls below would fail - confirm whether it is still needed.
# Reset the indexes so the frames can be placed side by side
df_legitimate_tweets = df_legitimate_tweets.reset_index(drop=True)
df_legitimate = df_legitimate.reset_index(drop=True)

df_polluters = df_polluters.reset_index(drop=True)
df_polluters_tweets = df_polluters_tweets.reset_index(drop=True)

# Remove the identifier and date columns from both profile frames
df_polluters = df_polluters.drop(["UserID", "CreatedAt", "CollectedAt"], axis=1)
df_legitimate = df_legitimate.drop(["UserID", "CreatedAt", "CollectedAt"], axis=1)

# Combine profile and tweet features column-wise. With axis=1 the rows are paired by
# index label (here the reset positions), not by UserID.
df_polluters_final = pd.concat([df_polluters, df_polluters_tweets], axis=1)

df_legitimate_final = pd.concat([df_legitimate, df_legitimate_tweets], axis=1)

In [None]:
df_polluters_tweets.head()

In [None]:
df_polluters_tweets.shape, df_legitimate_tweets.shape

In [None]:
df_polluters = df_polluters.drop(["UserID", "CreatedAt", "CollectedAt"], axis=1)
df_polluters.head()

In [None]:
df_polluters_final = pd.concat([df_polluters, df_polluters_tweets], axis=1)
df_polluters_final.head()

In [None]:
df_polluters_final.isna().sum()

In [None]:
df_polluters_final.shape

In [None]:
# Small demo of DataFrame.join, which joins on the index
df1 = pd.DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"])
df2 = pd.DataFrame({"B": [4, 5]}, index=["b", "c"])

# Left join (default): index "a" has no match in df2, so its "B" value is NaN
result = df1.join(df2)
# Show the joined frame (the cell used to print df1, which hid the join result)
result

In [None]:
# Back up

In [None]:
def calculate_at_proportion(df):
    """
    Add '@'-mention features computed from the 'Tweet' column of a dataframe.

    Missing tweets are treated as empty strings, so they get a length of 0
    and a proportion of 0 instead of being measured as the text "nan".

    Parameters:
        df (pd.DataFrame): The input dataframe containing a 'Tweet' column.
            It is modified in place.

    Returns:
        pd.DataFrame: The same dataframe, with 'Tweet' converted to strings
                     and these columns added:
                     - 'count_AT': Number of '@' characters in each tweet.
                     - 'Tweet_Length': Length of each tweet in characters.
                     - 'Proportion_AT': count_AT / Tweet_Length, or 0 for an
                       empty tweet.
    """
    # Convert to strings, but map missing values to "" rather than "nan"/"None"
    raw_tweets = df["Tweet"]
    df["Tweet"] = raw_tweets.astype(str).where(raw_tweets.notna(), "")

    # Vectorised string operations instead of a Python-level apply per row
    df["count_AT"] = df["Tweet"].str.count("@")
    df["Tweet_Length"] = df["Tweet"].str.len()

    # Mask zero lengths before dividing so empty tweets end up as 0, not NaN/inf
    nonzero_length = df["Tweet_Length"].where(df["Tweet_Length"] != 0)
    df["Proportion_AT"] = (df["count_AT"] / nonzero_length).fillna(0.0)

    return df

In [None]:
df_legitimate_tweets = calculate_at_proportion(df_legitimate_tweets)
df_legitimate_tweets.head()

In [None]:
jdjhjd  # NOTE(review): this name is not defined anywhere in the visible notebook, so running the cell would raise NameError. Presumably a scratch "stop here" marker - confirm, then remove.

In [None]:
df_no_duplicates_legitimate_tweets

In [None]:
df_polluters.shape

In [None]:
df_no_duplicates_legitimate_tweets.sort_values(by="UserID")

In [None]:
22_223 - 19_251

In [None]:
df_no_duplicates_legitimate_tweets.shape

In [None]:
df_legitimate.isna().sum()

In [None]:
df_polluters.shape, df_polluters_tweets.shape

In [None]:
df_polluters_tweets.head()

In [None]:
df_polluters_tweets.shape

In [None]:
tx = df_polluters_tweets.drop_duplicates()
tx.shape

In [None]:
df_no_duplicates = df_polluters_tweets.drop_duplicates(subset="UserID", keep="first")

In [11]:
df_no_duplicates.shape  # NOTE(review): fails with NameError (see output) because the cell that assigns df_no_duplicates has no execution count, i.e. it was not run in this session.

NameError: name 'df_no_duplicates' is not defined

In [None]:
df_polluters.shape

In [None]:
df_legitimate_tweets.head()

In [None]:
djlkjalk  # NOTE(review): this name is not defined anywhere in the visible notebook, so running the cell would raise NameError. Presumably a scratch "stop here" marker - confirm, then remove.

In [None]:
# def combine_files(df_1, df_2):
#     df_1 = df_1.drop(["UserID", "CreatedAt", "CollectedAt"], axis=1).drop_duplicates()
#     df_2 = df_2.drop(["UserID"], axis=1).drop_duplicates()

#     df_final = pd.concat([df_1, df_2], axis=1)
#     df_final = df_final.drop(["TweetID", "Tweet", "CreatedAt"], axis=1)
#     return df_final


# df_polutters_final = combine_files(df_polluters, df_polluters_tweets)
# # df_polluters_final.head()

In [None]:
# df_polutters_final.head()

In [None]:
# df_polutters_final.isna().sum()

In [None]:
# df_polutters_final.sample(10)

In [None]:
df_polluters = df_polluters.drop(["UserID", "CreatedAt", "CollectedAt"], axis=1)
df_polluters.head()

In [None]:
df_polluters_tweets.drop(["UserID"], axis=1, inplace=True)

In [None]:
df_polluters_tweets.head()

In [None]:
# NOTE(review): this rebinds df_final - the name cell In [2] uses for the combined
# polluters + legitimate dataset - to a polluters-only frame.
# With axis=1 the rows are paired by index label, not by UserID.
df_final = pd.concat([df_polluters, df_polluters_tweets], axis=1)
# Drop the raw tweet columns from the combined frame
df_polluters_final = df_final.drop(["TweetID", "Tweet", "CreatedAt"], axis=1)
df_polluters_final.head()

In [None]:
# def combine_files(df_1, df_2):
#     df_1 = df_1.drop(["UserID", "CreatedAt", "CollectedAt"], axis=1)
#     df_2 = df_2.drop(["UserID"], axis=1, inplace=True)
#     df_final = pd.concat([df_1, df_2], axis=1)
#     df_polluters_final = df_final.drop(["TweetID", "Tweet", "CreatedAt"], axis=1)
#     return df


# df_polutters_final = combine_files(df_polluters, df_polluters_tweets)
# df_polluters_final.head()

In [None]:
df_legitimate.head()

In [None]:
df_legitimate.shape

In [None]:
time_diff_polluters = calcul_time_diff(df_polluters_tweets)

time_diff_polluters.head()

In [None]:
df_polluters_tweets.head()

In [None]:
df_legitimate_tweets.head()

In [None]:
df_polluters_tweets.shape, df_legitimate_tweets.shape

In [None]:
df_polluters_tweets.shape

In [None]:
df_polluters_tweets.isna().sum()

In [None]:
857 + 19710

In [None]:
df_polluters_tweets = df_polluters_tweets.drop_duplicates()

In [None]:
df_polluters.shape

In [None]:
df_polluters_followings.drop_duplicates().shape

In [None]:
df_polluters.drop_duplicates().shape

In [None]:
df_polluters_tweets.shape

In [None]:
df_polluters.head()

In [None]:
df_polluters_tweets.head()

In [None]:
# def calcul_time_diff(df):
#     # Convertir la colonne CreatedAt en datetime
#     df["CreatedAt"] = pd.to_datetime(df["CreatedAt"])

#     # Trier les tweets par utilisateur et par date
#     df = df.sort_values(by=["UserID", "CreatedAt"])

#     # Calculer la différence de temps entre deux tweets consécutifs
#     df["Time_Diff"] = df.groupby("UserID")["CreatedAt"].diff()

#     # Convertir la différence de temps en secondes
#     # df["Time_Diff_Seconds"] = df["Time_Diff"].dt.total_seconds()

#     # Calculer le temps moyen et maximal entre deux tweets consécutifs par utilisateur
#     time_stats = (
#         df.groupby("UserID")["Time_Diff_Seconds"].agg(["mean", "max"]).reset_index()
#     )
#     time_stats.columns = [
#         "UserID",
#         "Mean_Time_Between_Tweets",
#         "Max_Time_Between_Tweets",
#     ]
#     return time_stats


# # Afficher les résultats
# time_diff_polluters = calcul_time_diff(df_polluters_tweets)

# df_polluters_tweets = df_polluters_tweets.merge(
#     time_diff_polluters, on="UserID", how="left"
# )
# df_polluters_tweets.head()

In [None]:
df_polluters.head()

In [None]:
df_polluters_tweets.head()

In [None]:
df_polluters_tweets["count_AT"]

In [None]:
# Add count_AT, Tweet_Length and Proportion_AT through the shared helper instead of
# repeating its body inline; the helper updates the frame in place and returns it.
df_polluters_tweets = calculate_at_proportion(df_polluters_tweets)

In [None]:
df_polluters_tweets["Tweet_Length"] = df_polluters_tweets["Tweet"].apply(len)

In [None]:
# Share of '@' characters per tweet: count_AT / Tweet_Length, or 0 when the length is 0.
# NOTE(review): same computation as the last step of calculate_at_proportion.
df_polluters_tweets["Proportion_AT"] = df_polluters_tweets.apply(
    lambda row: (
        row["count_AT"] / row["Tweet_Length"] if row["Tweet_Length"] != 0 else 0
    ),
    axis=1,
)

In [None]:
df_polluters_tweets.head()

In [None]:
df_polluters_tweets["Proportion_AT"].sample(20)

In [None]:
def calculate_at_proportion(df):
    """
    Add '@'-mention features computed from the 'Tweet' column of a dataframe.

    Missing tweets are treated as empty strings, so they get a length of 0
    and a proportion of 0 instead of being measured as the text "nan".

    Parameters:
        df (pd.DataFrame): The input dataframe containing a 'Tweet' column.
            It is modified in place.

    Returns:
        pd.DataFrame: The same dataframe, with 'Tweet' converted to strings
                     and these columns added:
                     - 'count_AT': Number of '@' characters in each tweet.
                     - 'Tweet_Length': Length of each tweet in characters.
                     - 'Proportion_AT': count_AT / Tweet_Length, or 0 for an
                       empty tweet.
    """
    # Convert to strings, but map missing values to "" rather than "nan"/"None"
    raw_tweets = df["Tweet"]
    df["Tweet"] = raw_tweets.astype(str).where(raw_tweets.notna(), "")

    # Vectorised string operations instead of a Python-level apply per row
    df["count_AT"] = df["Tweet"].str.count("@")
    df["Tweet_Length"] = df["Tweet"].str.len()

    # Mask zero lengths before dividing so empty tweets end up as 0, not NaN/inf
    nonzero_length = df["Tweet_Length"].where(df["Tweet_Length"] != 0)
    df["Proportion_AT"] = (df["count_AT"] / nonzero_length).fillna(0.0)

    return df

In [None]:
df_legitimate_tweets = calculate_at_proportion(df_legitimate_tweets)
df_legitimate_tweets.head()