<a href="https://colab.research.google.com/github/canerskrc/LLM-GPT/blob/main/Scikit_LLM_E_Mail_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import base64
import os.path
import pickle
import string

import nltk
import openai
import pandas as pd
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn_llm import LLMClassifier

# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into the source; the placeholder is kept as the fallback value.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_OPENAI_API_KEY')

# Data needed by preprocess_text: the stopword corpus and the Punkt tokenizer.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt',
# so both are fetched to keep word_tokenize working across versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

def authenticate_google():
    """Return a Gmail API client authorized for read-only access.

    Credentials are loaded from ``token.pickle`` when that file exists.
    If they are missing or not valid they are either refreshed (expired
    credentials that carry a refresh token) or obtained through the local
    OAuth flow configured by ``credentials.json``; the resulting
    credentials are then written back to ``token.pickle``.

    Returns:
        The object produced by ``build('gmail', 'v1', credentials=...)``.
    """
    SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
    token_file = 'token.pickle'

    # Start from whatever was cached by a previous run, if anything.
    credentials = None
    if os.path.exists(token_file):
        with open(token_file, 'rb') as cached:
            credentials = pickle.load(cached)

    # Usable cached credentials: nothing to renew, nothing to re-save.
    if credentials and credentials.valid:
        return build('gmail', 'v1', credentials=credentials)

    if credentials and credentials.expired and credentials.refresh_token:
        credentials.refresh(Request())
    else:
        oauth_flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        credentials = oauth_flow.run_local_server(port=0)

    # Persist the renewed credentials for the next run.
    with open(token_file, 'wb') as cached:
        pickle.dump(credentials, cached)

    return build('gmail', 'v1', credentials=credentials)

#Gmail API
def _decode_body_data(data):
    """Decode a base64url-encoded Gmail body string into text."""
    if not data:
        return ''
    # Restore '=' padding in case it was stripped, then decode. Undecodable
    # bytes are replaced rather than raising, since the part's charset is not
    # inspected here (UTF-8 is assumed).
    padded = data + '=' * (-len(data) % 4)
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='replace')


def _find_plain_text_data(payload):
    """Return the encoded data of the first text/plain part, depth-first."""
    if payload.get('mimeType') == 'text/plain':
        data = payload.get('body', {}).get('data')
        if data:
            return data
    # Multipart messages nest their content (possibly several levels deep)
    # under 'parts'.
    for part in payload.get('parts', []):
        data = _find_plain_text_data(part)
        if data:
            return data
    return ''


def fetch_emails(service, user_id='me', label_ids=None):
    """Fetch messages from Gmail and return their subject and plain-text body.

    Only the messages returned by a single ``messages().list`` call are
    processed; no further result pages are requested.

    Args:
        service: Gmail API client exposing ``users().messages()``.
        user_id: Mailbox owner passed as ``userId`` to the API.
        label_ids: Optional label filter passed as ``labelIds`` to the API.

    Returns:
        A list of ``{'subject': str, 'body': str}`` dicts, one per message.
        ``subject`` is '' when no Subject header is present; ``body`` is the
        decoded text of the first ``text/plain`` part, or '' when there is
        none.
    """
    results = service.users().messages().list(userId=user_id, labelIds=label_ids).execute()

    emails = []
    for message in results.get('messages', []):
        msg = service.users().messages().get(userId=user_id, id=message['id']).execute()
        payload = msg.get('payload', {})

        # Header names are case-insensitive, so match "Subject" loosely.
        subject = next(
            (header.get('value', '') for header in payload.get('headers', [])
             if header.get('name', '').lower() == 'subject'),
            '',
        )

        # Body data arrives base64url-encoded and must be decoded before any
        # text processing; single-part and nested multipart messages are both
        # handled by the recursive search.
        body = _decode_body_data(_find_plain_text_data(payload))

        emails.append({'subject': subject, 'body': body})
    return emails

#Preprocessing
def preprocess_text(text):
    """Normalize text for vectorization.

    Lower-cases the input, strips ASCII punctuation, tokenizes it with
    ``word_tokenize`` and drops English stopwords.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Load the stopword list once per call and keep it in a set: evaluating
    # stopwords.words() inside the filter would repeat the lookup for every
    # token and test membership against a list.
    english_stopwords = set(stopwords.words('english'))

    lowered = text.lower()
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(without_punctuation)
    return ' '.join(token for token in tokens if token not in english_stopwords)

#Main Function
def main():
    """Fetch Gmail messages, train and evaluate a classifier, classify a sample.

    Steps: authenticate, fetch e-mails, split bodies (features) and subjects
    (labels) 80/20, preprocess and TF-IDF-vectorize the bodies, fit the
    classifier, print accuracy and a classification report, then print the
    predicted category for one hard-coded sample message.

    Returns early with a message when fewer than two e-mails were fetched,
    because a train/test split cannot be built from them.
    """
    # Google API
    service = authenticate_google()

    emails = fetch_emails(service)
    # With no e-mails the DataFrame has no 'body'/'subject' columns, and a
    # single e-mail cannot be split into non-empty train and test sets.
    if len(emails) < 2:
        print("Not enough e-mails to build a train/test split; nothing to do.")
        return
    df = pd.DataFrame(emails)

    # NOTE(review): the subject line is used as the class label here, so each
    # distinct subject is its own class -- confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(df['body'], df['subject'], test_size=0.2, random_state=42)

    X_train = X_train.apply(preprocess_text)
    X_test = X_test.apply(preprocess_text)

    #Vectorizer
    # Fit on the training bodies only; the test set reuses that vocabulary.
    vectorizer = TfidfVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # LLMClassifier
    # NOTE(review): LLMClassifier's API is not visible in this file --
    # confirm that it accepts a completion callable and TF-IDF matrices.
    llm_classifier = LLMClassifier(openai.Completion.create)
    llm_classifier.fit(X_train_vec, y_train)
    y_pred = llm_classifier.predict(X_test_vec)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    #New E-Mail
    # The sample goes through the same preprocessing and fitted vectorizer as
    # the training data before prediction.
    new_email = "Müsait olduğun bir saat aralığında kahve içelim mi?"
    new_email_processed = preprocess_text(new_email)
    new_email_vec = vectorizer.transform([new_email_processed])
    predicted_category = llm_classifier.predict(new_email_vec)

    print(f"Yeni e-posta kategorisi: {predicted_category[0]}")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()