In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# -*- coding: utf-8 -*-
"""
Created on Wed May 22 15:09:11 2019

@author: Admin
"""

import numpy as np  
import pandas as pd
import re  
import nltk  
import pickle  
from nltk.corpus import stopwords 
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,auc
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from xgboost import XGBClassifier
#import scikitplot as skplt
from simple_salesforce import Salesforce
import logging
from datetime import datetime
import time
import json
import sys
from langdetect import detect
from googletrans import Translator
from textblob import TextBlob

# Wall-clock start; the last log line of the run reports the elapsed seconds.
start = time.time()

# Runtime settings (logging level, Salesforce credentials) are read from a
# JSON file next to the notebook so they are not hard-coded here.
with open("pythonconfig - Data.json") as json_data_file:
    data = json.load(json_data_file)

logger=logging.getLogger(__name__)

# Translate the configured level name into a logging constant.
# The name is normalised first: previously only exact lower-case spellings
# matched, and anything else (e.g. "INFO") silently left the logger at its
# inherited level, which drops every logger.info() call by default.
_LOG_LEVELS = {
    "debug": logging.DEBUG,
    "info": logging.INFO,
    "warning": logging.WARNING,
    "error": logging.ERROR,
    "critical": logging.CRITICAL,
}
_configured_level = str(data["loggingLevel"]).strip().lower()
if _configured_level in _LOG_LEVELS:
    logger.setLevel(_LOG_LEVELS[_configured_level])
else:
    # Keep the previous behaviour (level untouched) but make the problem visible.
    logger.warning(
        "Unrecognised loggingLevel %r in config; logger level left unchanged",
        data["loggingLevel"],
    )

#log format
formatter=logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# logging.getLogger() hands back the same logger object every time this cell
# is re-run in one kernel. Detach and close handlers left by earlier runs so
# messages are not emitted once per run and old log files do not stay open.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

#create file handler (one timestamped log file per run) and a console handler
file_handler=logging.FileHandler("pythonCodeMyLearnData{0}.txt".format(datetime.now().strftime("_%d-%B-%Y_%H-%M")))
stream_handler=logging.StreamHandler()

#add formatter to both handlers
file_handler.setFormatter(formatter)
stream_handler.setFormatter(formatter)

#add both handlers to logger
logger.addHandler(file_handler)
logger.addHandler(stream_handler)


try :
    # English stop-word list, fetched once and reused by Tokenizer for every record.
    chachedWords = stopwords.words('english')

    #### Load the models and transformers ####
    # NOTE(review): pickle.load can run arbitrary code embedded in the file;
    # only load .sav artifacts that come from a trusted source.
    # Each file is opened in a `with` block so its handle is closed as soon as
    # the object has been loaded (the bare open() calls were never closed).
    with open('count_vect.sav','rb') as artifact_file:
        count_vect = pickle.load(artifact_file)
    with open('tfidf.sav', 'rb') as artifact_file:
        tfidf_transformer = pickle.load(artifact_file)
    with open('xgb_model.sav', 'rb') as artifact_file:
        model = pickle.load(artifact_file)

    #### Preprocessing steps ####
    # Matches '<', one or more characters other than '>', then '>' (markup tags).
    TAG_RE = re.compile(r'<[^>]+>')


    def lang_trans(s):
        """Return ``s`` in English on a best-effort basis.

        The language is detected first and English text is returned untouched.
        Other text is translated with TextBlob, then with googletrans if
        TextBlob fails. If detection or both translations fail, the original
        text is returned so that a single record cannot abort the whole run.

        Args:
            s: Text to translate.

        Returns:
            The translated text, or ``s`` unchanged when it is already English
            or no service could process it.
        """
        en_blob = TextBlob(s)

        # Detection used to run outside any try block, so any error raised
        # here escaped through DataFrame.apply and ended the run.
        # NOTE(review): TextBlob.detect_language/translate look to have been
        # dropped from recent TextBlob releases - confirm the installed version.
        try:
            language = en_blob.detect_language()
        except Exception:
            try:
                # Fall back to langdetect, which this file already imports.
                language = detect(s)
            except Exception:
                # Language unknown: keep the text as it is.
                return s

        if language == "en":
            return s

        # `except Exception` rather than a bare except, so Ctrl-C still works.
        try:
            return "".join(en_blob.translate(to='en'))
        except Exception:
            pass

        try:
            translator = Translator()
            return translator.translate(s, dest='en').text
        except Exception:
            return s

    def remove_tags(text):
        """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
        stripped = re.sub(TAG_RE, '', text)
        return stripped

    def remove_emails(text):
        """Return ``text`` with e-mail addresses removed.

        An address is taken to be: a run of letters, digits, ``.``, ``-``,
        ``+`` or ``_``; an ``@``; another such run; a dot; and a run of
        letters. Matching ignores case - the pattern used to list only
        lower-case letters, so addresses such as ``John@Example.COM`` were
        left in place. Text that is already lower-case gives the same result
        as before.

        Args:
            text: Input string.

        Returns:
            The string with every matched address replaced by ``''``.
        """
        pattern = re.compile(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", re.IGNORECASE)
        return pattern.sub('',text)

    def clean_data(text):
        """Normalise raw case text before tokenisation.

        Steps, in order: drop lines that begin with an http(s) URL, turn
        newlines into spaces, remove tags and e-mail addresses, collapse
        whitespace runs, delete punctuation, digits and underscores, and
        lower-case the result.
        """
        # With MULTILINE, '^' anchors at every line start, so a line that
        # begins with a URL is removed up to and including its line break.
        without_url_lines = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        single_line = without_url_lines.replace("\n"," ")
        without_markup = remove_emails(remove_tags(single_line))
        collapsed = re.sub(r"\s+", " ", without_markup)
        without_punctuation = re.sub(r'[^\w\s]',"",collapsed)
        without_digits = re.sub(r'[0-9]+', '', without_punctuation)
        # '_' counts as a word character for \w, so it is stripped separately.
        return without_digits.replace("_","").lower()

    # One stemmer instance shared by every Tokenizer call.
    porter_stemmer=nltk.PorterStemmer()
    def Tokenizer(str_input):
        """Return the stemmed, stop-word-free tokens of ``str_input`` joined by spaces.

        Characters other than ASCII letters, digits and '-' become spaces; the
        text is lower-cased and split on whitespace; stop words are dropped;
        the remaining words are Porter-stemmed; stems of two characters or
        fewer are discarded.
        """
        normalised = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower()
        kept_stems = []
        for token in normalised.split():
            if token in chachedWords:
                continue
            stem = porter_stemmer.stem(token)
            # The length check applies to the stem, not to the original word.
            if len(stem) > 2:
                kept_stems.append(stem)
        return ' '.join(kept_stems)

    #### Read the new dataset ####
    # Credentials are taken from the JSON config loaded into `data`.
    sf = Salesforce(username=data['salesforce']['username'], password=data['salesforce']['password'], security_token=data['salesforce']['security_token'])
    # Bulk-query the Case object for records with Status 'New' and the given OwnerId.
    query="Select Id, CaseNumber, Subject, Description from Case where OwnerId='' and Status='New'"
    df=pd.DataFrame(sf.bulk.Case.query(query))

    nrows, ncols = df.shape
    logger.info("No. of records from MyLearn Support Queue:{0}".format(nrows))

    if df.empty:
        # Was `raise exception(...)`: the lower-case name is undefined, so the
        # log showed a NameError instead of this message.
        raise Exception("No Records!!")

    # Lower-case both text columns on the unfiltered frame. Assigning
    # 'Description' after the first filter wrote into a filtered slice, which
    # is what triggers pandas' SettingWithCopyWarning.
    df['Subject']=df['Subject'].apply(lambda x: str(x).lower())
    df['Description'] = df['Description'].apply(lambda x: str(x).lower())

    # Cases whose subject or description contains any of these phrases are
    # dropped. The pattern is defined once instead of being repeated per column.
    noise_pattern = 'out of the office|ooo|out of office|payment|remit|merge'
    subject_is_noise = df.Subject.str.contains(noise_pattern, regex=True)
    description_is_noise = df.Description.str.contains(noise_pattern, regex=True)

    # Same rows survive as with the two sequential filters. The explicit copy
    # gives later column assignments an independent frame to write into.
    df = df[~(subject_is_noise | description_is_noise)].copy()
    df = df.set_index('CaseNumber')

    #### Clean the data ####
    # Description and subject are joined into one string per case, lower-cased
    # and stripped of newlines, translated to English where needed, then cleaned.
    description_text = df['Description'].apply(str)
    subject_text = df['Subject'].apply(str)
    df['Text'] = description_text + " " + subject_text
    df['Text'] = df['Text'].apply(lambda raw: str(raw).lower().replace('\n',' '))
    df['Text'] = df['Text'].apply(lang_trans)
    df['Clean_Text'] = df['Text'].apply(clean_data)

    # X holds the stemmed, stop-word-free text that is passed to the vectorizer.
    X = df['Clean_Text'].apply(Tokenizer)

    #### Transform the data ####
    # Counts first, then TF-IDF weighting; both results are converted to dense arrays.
    X_counts = count_vect.transform(X).toarray()
    features = tfidf_transformer.transform(X_counts).toarray()

    #### Predict using loaded model ####
    # Column 1 of predict_proba is used as the score for label 1.
    class_probabilities = model.predict_proba(features)
    prob = class_probabilities[:, 1]
    # A case is labelled 1 only when its score is strictly above 0.55.
    pred = [int(score > 0.55) for score in prob]

    df['Predicted Labels'] = pred
    df['Output Probability'] = prob

    # Move CaseNumber from the index back into a regular column.
    df.reset_index(inplace=True)



    # Keep only the cases predicted as label 1, with the columns needed downstream.
    output_columns = ['CaseNumber','Subject', 'Description','Clean_Text','Output Probability']
    is_predicted_positive = df['Predicted Labels'] == 1
    final_df = df.loc[is_predicted_positive, output_columns].copy(deep=True)
    final_df.to_excel('input_data.xlsx', index=False)

    nrows, ncols = final_df.shape
    logger.info("No. of Output predicted records:{0}".format(nrows))

    ###########################
    logger.info("<<<INPUT FILE SUCCESSFULLY CREATED>>>")

except:
    logger.error("<<<Error>>>:\n {0}".format(sys.exc_info()))

finally:
    logger.info("*************************")
    logger.info("--- %s seconds ---" % (time.time() - start))




ModuleNotFoundError: No module named 'simple_salesforce'