Browse files

modify spider

  • Loading branch information...
1 parent 5d3bf12 commit 7fd33d1a2e4b56d8ce6b61bf474964612377ee2b @Yemsheng committed Jan 10, 2013
Showing with 1,730 additions and 21 deletions.
  1. BIN spider/.spider.cpp.swp
  2. +74 −21 spider/spider.cpp
  3. +149 −0 spider/spider.cpp~
  4. +1,507 −0 spider/www.dangdang.com2.html~
View
BIN spider/.spider.cpp.swp
Binary file not shown.
View
95 spider/spider.cpp
@@ -9,25 +9,40 @@
#include <string.h>
#include <unistd.h>
+
+const int URL_SIZE = 512; // capacity of g_url, in bytes
+const int FILE_NAME_SIZE = 512; // capacity of main()'s saveFileName buffer
+const int BUFFERSIZE = 1024; // capacity of main()'s receiveBuffer passed to recv()
+const int HTTP_MSG_BUFFER_SIZE = 1024; // capacity of the request buffer handed to MakeHttpSendMsgContent()
+
+char *g_domain; // host name; main() points it at argv[1]
+char g_url[URL_SIZE]; // "http://" + g_domain + "/", assembled in main()
+
+char *MakeHttpSendMsgContent(char *msg, const int msgSize); // fills msg with the HTTP GET request text and returns msg, or NULL for a NULL buffer / non-positive size
+
int main(int argc, char *argv[])
{
if(argc!=2)
{
perror("argc != 2");
exit(1);
}
- char *domain = argv[1];
- char url[512];
- memset(url, 0, sizeof(url));
- sprintf(url, "http://");
- strcat(url, domain);
- strcat(url, "/");
+ g_domain = argv[1];
+ memset(g_url, 0, sizeof(g_url));
+ sprintf(g_url, "http://");
+ strcat(g_url, g_domain);
+ strcat(g_url, "/");
+
+ char saveFileName[FILE_NAME_SIZE];
+ memset(saveFileName, 0, sizeof(saveFileName));
+ sprintf(saveFileName, g_domain);
+ strcat(saveFileName, ".html");
struct hostent *h;
char *ipAddr = NULL;
- if ((h=gethostbyname(domain)) == NULL)
+ if ((h=gethostbyname(g_domain)) == NULL)
{
herror("gethostbyname");
exit(1);
@@ -58,19 +73,8 @@ int main(int argc, char *argv[])
perror("conncet failed\n");
}
- char msg[1024];
- memset(msg, 0, sizeof(msg));
- sprintf(msg,"GET ");
- strcat(msg, url);
- strcat(msg, " HTTP/1.1");
-
- strcat(msg, "\r\nHost: ");
- strcat(msg, domain);
-
- strcat(msg, "\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1");
- strcat(msg, "\r\nAccept: */*");
- strcat(msg, "\r\nConnection: close\r\n\r\n");
- printf("%s\n",msg);
+ char msg[HTTP_MSG_BUFFER_SIZE];
+ MakeHttpSendMsgContent(msg, sizeof(msg));
int sendState = 0;
sendState = send(client_fd, msg, strlen(msg), 0);
@@ -81,16 +85,65 @@ int main(int argc, char *argv[])
}
int receiveLen = 0;
- char receiveBuffer[1024];
+ char receiveBuffer[BUFFERSIZE];
+
+ FILE *fout;
+ fout= fopen(saveFileName, "w");
while(true)
{
receiveLen = recv(client_fd, receiveBuffer,sizeof(receiveBuffer), 0);
printf("receive len = %d\n", receiveLen);
if(receiveLen<=0)
break;
+ fwrite(receiveBuffer, receiveLen, 1, fout);
}
+ fclose(fout);
close(client_fd);
return 0;
}
+
+/**
+ * Build the HTTP/1.1 GET request for g_url into msg.
+ *
+ * msg      caller-owned output buffer; it is zero-filled before formatting
+ * msgSize  capacity of msg in bytes, including the terminating NUL
+ *
+ * Returns msg on success. Returns NULL when msg is NULL, msgSize is not
+ * positive, g_domain has not been set, or the complete request does not fit
+ * in msgSize bytes (msg then holds a truncated, NUL-terminated prefix that
+ * is not a valid request and must not be sent).
+ */
+char *MakeHttpSendMsgContent(char *msg, const int msgSize)
+{
+    if(msg==NULL||msgSize<=0)
+        return NULL;
+
+    memset(msg, 0, msgSize);
+
+    // Passing a NULL pointer for "%s" is undefined behaviour, so refuse early.
+    if(g_domain==NULL)
+        return NULL;
+
+    // One bounded snprintf replaces the former strncat chain.
+    // strncat(dst, src, n) may store n characters PLUS a terminating NUL, so
+    // passing the whole remaining capacity as n could write one byte past
+    // the end of msg.
+    int needed = snprintf(msg, msgSize,
+        "GET %s HTTP/1.1"
+        "\r\nHost: %s"
+        "\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
+        "\r\nAccept: */*"
+        "\r\nConnection: close\r\n\r\n",
+        g_url, g_domain);
+
+    // strlen() yields size_t; convert once so the %d conversions below match.
+    int msgLen = (int)strlen(msg);
+    int leftSize = msgSize - msgLen;
+
+    printf("%s\ntoatl msgBuffer size = %d msglen = %d leftSize = %d\n",msg,msgSize, msgLen, leftSize);
+
+    // snprintf returns the length the full text needs; a value that does not
+    // fit below msgSize means the request was cut short.
+    if(needed<0||needed>=msgSize)
+        return NULL;
+
+    return msg;
+}
View
149 spider/spider.cpp~
@@ -0,0 +1,149 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <netdb.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <unistd.h>
+
+
+const int URL_SIZE = 512; // capacity of g_url, in bytes
+const int FILE_NAME_SIZE = 512; // capacity of main()'s saveFileName buffer
+const int BUFFERSIZE = 1024; // capacity of main()'s receiveBuffer passed to recv()
+const int HTTP_MSG_BUFFER_SIZE = 1024; // capacity of the request buffer handed to MakeHttpSendMsgContent()
+
+char *g_domain; // host name; main() points it at argv[1]
+char g_url[URL_SIZE]; // "http://" + g_domain + "/", assembled in main()
+
+char *MakeHttpSendMsgContent(char *msg, const int msgSize); // fills msg with the HTTP GET request text and returns msg, or NULL for a NULL buffer / non-positive size
+
+/**
+ * Connect to the host named by argv[1] on TCP port 80, send the request
+ * produced by MakeHttpSendMsgContent(), and write every byte received to
+ * the file "<domain>.html" in the current directory.
+ *
+ * Exits with status 1 when the argument count is wrong, the host name is
+ * too long for the fixed buffers, or resolving, socket creation,
+ * connecting, request building, sending or opening the output file fails.
+ * Returns 0 once recv() reports end of stream or an error, and 1 when
+ * writing or closing the output file fails.
+ */
+int main(int argc, char *argv[])
+{
+    if(argc!=2)
+    {
+        // Not an errno failure: perror() would append an unrelated description.
+        fprintf(stderr, "argc != 2\n");
+        exit(1);
+    }
+    g_domain = argv[1];
+
+    // The host name comes from the command line, so every copy is bounded
+    // and checked for truncation instead of using sprintf/strcat.
+    memset(g_url, 0, sizeof(g_url));
+    int urlLen = snprintf(g_url, sizeof(g_url), "http://%s/", g_domain);
+    if(urlLen<0||urlLen>=(int)sizeof(g_url))
+    {
+        fprintf(stderr, "domain too long for url buffer\n");
+        exit(1);
+    }
+
+    char saveFileName[FILE_NAME_SIZE];
+    memset(saveFileName, 0, sizeof(saveFileName));
+    // The host name is passed as data, never as the format string itself.
+    int nameLen = snprintf(saveFileName, sizeof(saveFileName), "%s.html", g_domain);
+    if(nameLen<0||nameLen>=(int)sizeof(saveFileName))
+    {
+        fprintf(stderr, "domain too long for file name buffer\n");
+        exit(1);
+    }
+
+    struct hostent *h;
+    char *ipAddr = NULL;
+
+    if ((h=gethostbyname(g_domain)) == NULL)
+    {
+        herror("gethostbyname");
+        exit(1);
+    }
+
+    printf("Host name : %s\n", h->h_name);
+    ipAddr = inet_ntoa(*((struct in_addr*)h->h_addr));
+    printf("IP Address : %s\n", ipAddr);
+
+    int client_fd;
+    client_fd = socket(AF_INET, SOCK_STREAM, 0);
+    if(client_fd==-1)
+    {
+        perror("socket failed\n");
+        exit(1);
+    }
+
+    struct sockaddr_in server_addr;
+    memset(&server_addr, 0, sizeof(server_addr));
+    server_addr.sin_family = AF_INET;
+    server_addr.sin_port = htons(80);
+    server_addr.sin_addr = *((struct in_addr*)h->h_addr);
+
+    int connectState;
+    connectState = connect(client_fd, (struct sockaddr*)&server_addr, sizeof(server_addr));
+    if(connectState==-1)
+    {
+        perror("conncet failed\n");
+        // Sending on an unconnected socket cannot succeed, so stop here.
+        close(client_fd);
+        exit(1);
+    }
+
+    char msg[HTTP_MSG_BUFFER_SIZE];
+    if(MakeHttpSendMsgContent(msg, sizeof(msg))==NULL)
+    {
+        fprintf(stderr, "building the http request failed\n");
+        close(client_fd);
+        exit(1);
+    }
+
+    // send() may accept fewer bytes than requested; repeat until the whole
+    // request has been handed to the socket.
+    size_t msgLen = strlen(msg);
+    size_t sentTotal = 0;
+    while(sentTotal<msgLen)
+    {
+        ssize_t sendState = send(client_fd, msg+sentTotal, msgLen-sentTotal, 0);
+        if(sendState<=0)
+        {
+            perror("send Error\n");
+            close(client_fd);
+            exit(1);
+        }
+        sentTotal += (size_t)sendState;
+    }
+
+    FILE *fout;
+    fout= fopen(saveFileName, "w");
+    if(fout==NULL)
+    {
+        // fwrite/fclose on a NULL stream would crash.
+        perror("fopen");
+        close(client_fd);
+        exit(1);
+    }
+
+    int exitCode = 0;
+    int receiveLen = 0;
+    char receiveBuffer[BUFFERSIZE];
+    while(true)
+    {
+        receiveLen = recv(client_fd, receiveBuffer,sizeof(receiveBuffer), 0);
+        printf("receive len = %d\n", receiveLen);
+        if(receiveLen<=0)
+            break;
+        // Element size 1 makes the return value a byte count that can be
+        // compared against what was received.
+        if(fwrite(receiveBuffer, 1, receiveLen, fout)!=(size_t)receiveLen)
+        {
+            perror("fwrite");
+            exitCode = 1;
+            break;
+        }
+    }
+    // fclose flushes buffered data, so its result decides whether the file
+    // was really written.
+    if(fclose(fout)!=0)
+    {
+        perror("fclose");
+        exitCode = 1;
+    }
+    close(client_fd);
+
+    return exitCode;
+}
+
+/**
+ * Build an HTTP/1.1 GET request into msg.
+ *
+ * In this revision the request line targets a fixed product-page URL
+ * (the g_url variant is disabled); the Host header still carries g_domain.
+ *
+ * msg      caller-owned output buffer; it is zero-filled before formatting
+ * msgSize  capacity of msg in bytes, including the terminating NUL
+ *
+ * Returns msg on success. Returns NULL when msg is NULL, msgSize is not
+ * positive, g_domain has not been set, or the complete request does not fit
+ * in msgSize bytes (msg then holds a truncated, NUL-terminated prefix that
+ * is not a valid request and must not be sent).
+ */
+char *MakeHttpSendMsgContent(char *msg, const int msgSize)
+{
+    if(msg==NULL||msgSize<=0)
+        return NULL;
+
+    memset(msg, 0, msgSize);
+
+    // Passing a NULL pointer for "%s" is undefined behaviour, so refuse early.
+    if(g_domain==NULL)
+        return NULL;
+
+    // Fixed request target used by this revision instead of g_url.
+    const char *requestUrl = "http://product.dangdang.com/product.aspx?product_id=1039656721";
+
+    // One bounded snprintf replaces the former strncat chain.
+    // strncat(dst, src, n) may store n characters PLUS a terminating NUL, so
+    // passing the whole remaining capacity as n could write one byte past
+    // the end of msg.
+    int needed = snprintf(msg, msgSize,
+        "GET %s HTTP/1.1"
+        "\r\nHost: %s"
+        "\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
+        "\r\nAccept: */*"
+        "\r\nConnection: close\r\n\r\n",
+        requestUrl, g_domain);
+
+    // strlen() yields size_t; convert once so the %d conversions below match.
+    int msgLen = (int)strlen(msg);
+    int leftSize = msgSize - msgLen;
+
+    printf("%s\ntoatl msgBuffer size = %d msglen = %d leftSize = %d\n",msg,msgSize, msgLen, leftSize);
+
+    // snprintf returns the length the full text needs; a value that does not
+    // fit below msgSize means the request was cut short.
+    if(needed<0||needed>=msgSize)
+        return NULL;
+
+    return msg;
+}
View
1,507 spider/www.dangdang.com2.html~
1,507 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.

0 comments on commit 7fd33d1

Please sign in to comment.