In [None]:
import pandas as pd
import re
from collections import Counter


In [None]:
# Load the source CSV; later cells read its 'text' column.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/sonuc2.csv')

In [None]:
def keyword_frequencies(text, keywords):
    """Return each keyword's share of the words in ``text`` as a percentage.

    The text is lower-cased and split into word tokens; a keyword's value is
    ``100 * occurrences / total number of tokens``.

    Args:
        text: The message to analyse (a string).
        keywords: Iterable of keywords, compared against the lower-cased
            tokens. Duplicate keywords collapse into one dictionary entry.

    Returns:
        dict mapping every keyword to its percentage. All values are 0 when
        ``text`` contains no word tokens.
    """
    # str.lower() maps the Turkish capital "İ" (U+0130) to "i" followed by a
    # combining dot above (U+0307). That mark is not a \w character, so a word
    # such as "İNDİRİM" would be cut into fragments and never match the
    # keyword "indirim". Fold the two-character sequence back to a plain "i".
    normalized = text.lower().replace('i\u0307', 'i')

    words = re.findall(r'\b\w+\b', normalized)
    word_counts = Counter(words)
    total_words = len(words)

    # Percentage of each keyword; guard against division by zero for empty text
    frequencies = {
        keyword: (word_counts.get(keyword, 0) / total_words) * 100
        if total_words > 0 else 0
        for keyword in keywords
    }
    return frequencies

In [None]:
def char_frequencies(text, characters):
    """Compute how often each requested character occurs in ``text``, in percent.

    Args:
        text: The message to analyse (a string).
        characters: Iterable of single characters to report on.

    Returns:
        dict mapping each requested character to
        ``100 * occurrences / number of counted characters``; every value is 0
        when nothing was counted.
    """
    # "." does not match a newline, so line breaks are left out of the totals.
    matched = re.findall(r'.', text)
    tally = Counter(matched)
    counted = len(matched)

    shares = {}
    for symbol in characters:
        if counted > 0:
            shares[symbol] = (tally.get(symbol, 0) / counted) * 100
        else:
            shares[symbol] = 0
    return shares

In [None]:
def capital_run_length(text):
    """Summarise the runs of consecutive capital letters found in ``text``.

    A run is a maximal sequence of characters from A-Z plus the Turkish
    capitals Ç, Ş, Ğ, Ü, İ and Ö.

    Args:
        text: The message to analyse (a string).

    Returns:
        dict with the keys ``average_run_length`` (mean run length),
        ``longest_run`` (length of the longest run) and ``total_capitals``
        (sum of all run lengths). All three are 0 when there are no capitals.
    """
    run_lengths = [len(match) for match in re.findall(r'[A-ZÇŞĞÜİÖ]+', text)]

    # No capitals at all: report zeros instead of dividing by zero.
    if not run_lengths:
        return {
            "average_run_length": 0,
            "longest_run": 0,
            "total_capitals": 0
        }

    capitals = sum(run_lengths)
    return {
        "average_run_length": capitals / len(run_lengths),
        "longest_run": max(run_lengths),
        "total_capitals": capitals
    }

In [None]:
def extract_currency_features(text):
    """Count money mentions and numbers in ``text`` and relate them to its length.

    A money mention is a run of digits, optionally followed by one whitespace
    character, followed by ``tl``, ``₺``, ``usd`` or ``$`` (case-insensitive).
    A number is any run of digits. Percentages are taken relative to the
    number of word tokens in the text.

    Args:
        text: The message to analyse (a string).

    Returns:
        dict with ``para_bahisleri`` (money mention count), ``para_yüzdesi``
        (money mentions as a percentage of tokens), ``sayı_bahisleri``
        (number count) and ``sayı_yüzdesi`` (numbers as a percentage of
        tokens). Percentages are 0 when the text has no tokens.
    """
    def _percent_of(count, total):
        # Share of `count` in `total`, or 0 when there is nothing to divide by.
        return (count / total) * 100 if total > 0 else 0

    # Currency amounts are matched on the lower-cased text
    money_hits = len(re.findall(r'(?:\d+\s?(?:tl|₺|usd|\$))', text.lower()))
    # Every digit run counts as one number
    number_hits = len(re.findall(r'\d+', text))
    # Denominator: all word tokens, which includes the numbers themselves
    token_total = len(re.findall(r'\b\w+\b', text))

    features = {}
    features["para_bahisleri"] = money_hits
    features["para_yüzdesi"] = _percent_of(money_hits, token_total)
    features["sayı_bahisleri"] = number_hits
    features["sayı_yüzdesi"] = _percent_of(number_hits, token_total)
    return features


In [None]:
def process_emails_and_extract_features(df, keywords, characters):
    """Append keyword, character, capital-run and currency features to ``df``.

    For every value in ``df['text']`` the four extractor functions
    (``keyword_frequencies``, ``char_frequencies``, ``capital_run_length`` and
    ``extract_currency_features``) are called and their results merged into
    one row of features.

    Args:
        df: DataFrame with a ``text`` column holding the messages.
        keywords: Keywords passed to ``keyword_frequencies``.
        characters: Characters passed to ``char_frequencies``.

    Returns:
        A new DataFrame containing the columns of ``df`` followed by one
        column per extracted feature. ``df`` itself is not modified.
    """
    # Missing messages are NaN (a float) in pandas and would break the string
    # operations in the extractors, so treat them as empty text.
    emails = df['text'].fillna('').astype(str)

    # One merged feature dictionary per message
    feature_rows = [
        {
            **keyword_frequencies(email, keywords),
            **char_frequencies(email, characters),
            **capital_run_length(email),
            **extract_currency_features(email),
        }
        for email in emails
    ]

    # Reuse df's index: concat(axis=1) aligns on index labels, so a feature
    # frame with a fresh 0..n-1 index would be misaligned (and padded with
    # NaN rows) whenever df has been filtered, sorted or otherwise re-indexed.
    new_features_df = pd.DataFrame(feature_rows, index=df.index)
    result_df = pd.concat([df, new_features_df], axis=1)

    return result_df

In [None]:
# Keywords and characters whose frequencies become features.
# Each keyword is listed once; a repeated entry would add no extra column
# because the features are keyed by keyword.
keywords = ["tl","icin","karne","com","indirim","kent","üzeri","nakit","aday","aksam","sms","neyse","lira",
            "son","yılbaşı","gönder","http","makale","ozel","tr","evet","benzer","aninda","eglenceli","emek",
            "telefon","yil","içerik","kampanya","kadar","detaylar","yasaklı","bilgi","yaz","hemen","çözüm",
            "ders","zaman","ortak","bip","kullan","twitter","bugün","ucretsiz","bonus","açıklama","yeni",
            "pahalı","kolay","sadece","hediye","bilginize","lütfen","ret","deprem","firsat","cok","spor",
            "konaklama","size","gonder","ekim","almak","iptal","takip","internet"]
characters = [';', '(', '[', '!', '*', '#','%']
# Run the feature extraction
result_df = process_emails_and_extract_features(df, keywords, characters)
# Save the result to a new CSV file
result_df.to_csv('eemails.csv', index=False)

In [None]:
# Reload the feature file using the same relative path it was written to,
# so this cell does not depend on the working directory being /content.
df = pd.read_csv('eemails.csv')
df.head(25)

Unnamed: 0.1,Unnamed: 0,text,sonuc,tl,icin,karne,com,indirim,kent,üzeri,...,*,#,%,average_run_length,longest_run,total_capitals,para_bahisleri,para_yüzdesi,sayı_bahisleri,sayı_yüzdesi
0,0,125 lira,norm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0.0,1,50.0
1,1,Baskanin aksam toplantısi fenaymis :),norm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
2,2,Bilal yalçnlara ne zaman gidiyoruz?,norm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
3,3,"BiP ile mesajlarimi aninda, daha eglenceli gon...",spam,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,5,0,0.0,0,0.0
4,4,DIGITURKTEN FIRSAT! SiZE OZEL YIL SONUNA KADAR...,spam,3.703704,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.655172,11,106,1,3.703704,3,11.111111
5,5,İyi ya dokuz eylül iyidir arş.gör falan kovala ;),norm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
6,6,İyiyim teşekkürler oturuyoruz nazarda arkadaşl...,norm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
7,7,Kapatamıyorun ayarlara girmem lazım :),norm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
8,8,Menüye girsem görünür mü acaba ??,norm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
9,9,Önemli değil hocam iyi akşamlar ;),norm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0


In [None]:
# # Save the CSV file to the desktop
# df.to_csv(r'C:\Users\aysen\OneDrive\eemails.csv', index=False)

In [None]:
# Load the feature file (eemails.csv)
emails_df = pd.read_csv('eemails.csv')
# Encode the 'sonuc' label numerically: norm -> 0, spam -> 1, anything else -> None
emails_df['sonuc'] = emails_df['sonuc'].apply(lambda label: {'norm': 0, 'spam': 1}.get(label))
# Columns to leave out of the saved file
columns_to_drop = ['Unnamed: 0', 'text']
# Remove them by rebinding the name rather than mutating the frame in place
emails_df = emails_df.drop(columns=columns_to_drop)
# Show the remaining columns as a quick check
print(emails_df.head())
# Save the result to a new file
emails_df.to_csv('new_emails.csv', index=False)
print("Yeni CSV dosyası 'new_emails.csv' olarak kaydedildi.")


   sonuc        tl  icin  karne  com  indirim  kent  üzeri  nakit  aday  ...  \
0      0  0.000000   0.0    0.0  0.0      0.0   0.0    0.0    0.0   0.0  ...   
1      0  0.000000   0.0    0.0  0.0      0.0   0.0    0.0    0.0   0.0  ...   
2      0  0.000000   0.0    0.0  0.0      0.0   0.0    0.0    0.0   0.0  ...   
3      1  0.000000   0.0    0.0  0.0      0.0   0.0    0.0    0.0   0.0  ...   
4      1  3.703704   0.0    0.0  0.0      0.0   0.0    0.0    0.0   0.0  ...   

     *    #    %  average_run_length  longest_run  total_capitals  \
0  0.0  0.0  0.0            0.000000            0               0   
1  0.0  0.0  0.0            1.000000            1               1   
2  0.0  0.0  0.0            1.000000            1               1   
3  0.0  0.0  0.0            1.000000            1               5   
4  0.0  0.0  0.0            3.655172           11             106   

   para_bahisleri  para_yüzdesi  sayı_bahisleri  sayı_yüzdesi  
0               0      0.000000         

In [None]:
# Reload the saved file using the same relative path it was written to,
# so this cell does not depend on the working directory being /content.
df = pd.read_csv('new_emails.csv')
df.head(25)

Unnamed: 0,sonuc,tl,icin,karne,com,indirim,kent,üzeri,nakit,aday,...,*,#,%,average_run_length,longest_run,total_capitals,para_bahisleri,para_yüzdesi,sayı_bahisleri,sayı_yüzdesi
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0.0,1,50.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,5,0,0.0,0,0.0
4,1,3.703704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.655172,11,106,1,3.703704,3,11.111111
5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
8,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
9,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1,1,0,0.0,0,0.0
